Merge pull request #282 from 4dn-dcic/kmp_sheet_utils_with_vapp

Add portal_vapp= functionality to sheet_utils
4dn-dcic · Sep 7, 2023 · 295adfe · 295adfe
2 parents e09af07 + 5a07b69
commit 295adfe
Show file tree

Hide file tree

Showing 8 changed files with 305 additions and 42 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,7 +7,7 @@ Change Log
 ----------
 
 
-7.10.0
+7.11.0
 ======
 
 * New module ``sheet_utils`` for loading workbooks.
@@ -34,6 +34,23 @@ Change Log
   * New class ``JsonLinesReader``
 
 
+7.10.0
+======
+
+* In ``ff_utils``:
+
+  * New arguments ``portal_env=`` and ``portal_vapp=``
+    for functions ``get_schema`` and ``get_schemas``.
+
+* In ``s3_utils``:
+
+  * Fix a failing test (caused by an environmental change, no functional change).
+
+* In ``license_utils``:
+
+  * Allow C4 infrastructure to use the ``chardet`` library.
+
+
 7.9.0
 =====
 

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
@@ -8,7 +8,7 @@
 
 from collections import namedtuple
 from elasticsearch.exceptions import AuthorizationException
-from typing import Optional, Dict, List
+from typing import Dict, List, Optional
 from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
 from . import s3_utils, es_utils
 from .common import (
@@ -17,7 +17,7 @@
     # S3BucketName, S3KeyName,
 )
 from .lang_utils import disjoined_list
-from .misc_utils import PRINT, to_camel_case, remove_suffix
+from .misc_utils import PRINT, to_camel_case, remove_suffix, VirtualApp
 
 
 # TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP.
@@ -419,7 +419,7 @@ def search_result_generator(page_generator):
     but where a page size of 3 is used with start position 0. That call will return A,C,E. The
     user may expect G,I on the second page, but before it can be done, suppose an element D is
     indexed and that the stored data is A,C,D,E,G,I,K,M. Requesting data from start position 0 would
-    now return A,C,D but we already had the first page, so we request data starting at position 3
+    now return A,C,D, but we already had the first page, so we request data starting at position 3
     for the second page and get E,G,I.  That means our sequence of return values would be A,C,E,E,G,I,K,M,
     or, in other words, showing a duplication. To avoid this, we keep track of the IDs we've seen
     and show only the first case of each element, so A,C,E,G,I,K,M. (We won't see the D, but we weren't
@@ -647,7 +647,7 @@ def get_associated_qc_metrics(uuid, key=None, ff_env=None, include_processed_fil
                               include_raw_files=False,
                               include_supplementary_files=False):
     """
-    Given a uuid of an experimentSet return a dictionary of dictionaries with each dictionary
+    Given a UUID of an experimentSet return a dictionary of dictionaries with each dictionary
     representing a quality metric.
 
     Args:
@@ -942,41 +942,90 @@ def _get_es_metadata(uuids, es_client, filters, sources, chunk_size, auth):
                 yield hit['_source']  # yield individual items from ES
 
 
-def get_schema(name, key=None, ff_env=None) -> Dict:
+def resolve_portal_env(ff_env: Optional[str], portal_env: Optional[str],
+                       portal_vapp: Optional[VirtualApp]) -> Optional[str]:
+    """
+    Resolves which of ff_env and portal_env to use (after doing consistency checking).
+    There are two consistency checks performed, for which an error is raised on failure:
+        1. If neither ff_env= nor portal_env= is None, the values must be the same.
+        2. If either ff_env= or portal_env= is not None, portal_vapp= must be None.
+
+    The intent is that callers will do:
+        portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp)
+    and then afterward not have to worry that arguments are inconsistent.
+
+    Args:
+        ff_env:      an environment name or None
+        portal_env:  an environment name or None
+        portal_vapp: a VirtualApp or None
+    """
+    if ff_env:
+        if portal_env and portal_env != ff_env:
+            raise ValueError("You may not supply both portal_env= and ff_env= together.")
+        portal_env = ff_env
+    if portal_env and portal_vapp:
+        env_arg_name = 'ff_env=' if ff_env else 'portal_env='
+        raise ValueError(f"You may not supply both portal_vapp= and {env_arg_name} together.")
+    return portal_env
+
+
+def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optional[str] = None,
+               portal_vapp: Optional[VirtualApp] = None) -> Dict:
     """
     Gets the schema definition with the given name.
 
+    Only one of portal_env= (or ff_env=) or portal_vapp= can be provided. This determines how the schemas are obtained.
+
     Args:
         name (str):   a schema name (CamelCase or snake_case), or None
         key (dict):   standard ff_utils authentication key
-        ff_env (str): standard ff environment string
+        ff_env (str): standard environment string (deprecated, please prefer portal_env=)
+        portal_env:   standard environment string (compatible replacement for ff_env=)
+        portal_vapp:  a VirtualApp or None
 
     Returns:
         dict: contains key schema names and value item class names
     """
-    auth = get_authentication_with_server(key, ff_env)
-    url = f"profiles/{to_camel_case(name)}.json"
-    schema = get_metadata(url, key=auth, add_on='frame=raw')
-    return schema
+    portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp)
+    base_url = f"profiles/{to_camel_case(name)}.json"
+    add_on = 'frame=raw'
+    if portal_vapp:
+        full_url = f"{base_url}?{add_on}"
+        res = portal_vapp.get(full_url)
+        return get_response_json(res)
+    else:
+        schema = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on)
+        return schema
 
 
-def get_schemas(key=None, ff_env=None, *, allow_abstract=True, require_id=False) -> Dict[str, Dict]:
+def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool = True, require_id: bool = False,
+                portal_env: Optional[str] = None, portal_vapp: Optional[VirtualApp] = None) -> Dict[str, Dict]:
     """
     Gets a dictionary of all schema definitions.
     By default, this returns all schemas, but the allow_abstract= and require_id= keywords allow limited filtering.
 
+    Only one of portal_env= (or ff_env=) or portal_vapp= can be provided. This determines how the schemas are obtained.
+
     Args:
         key (dict):               standard ff_utils authentication key
-        ff_env (str):             standard ff environment string
+        ff_env (str):             standard environment string (deprecated, please prefer portal_env=)
+        portal_env:               standard environment string (compatible replacement for ff_env=)
+        portal_vapp:              a VirtualApp or None
         allow_abstract (boolean): controls whether abstract schemas can be returned (default True, return them)
         require_id (boolean):     controls whether a '$id' field is required for schema to be included
                                   (default False, include even if no $id)
 
     Returns:
         dict: a mapping from keys that are schema names to schema definitions
     """
-    auth = get_authentication_with_server(key, ff_env)
-    schemas: Dict[str, Dict] = get_metadata('profiles/', key=auth, add_on='frame=raw')
+    portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp)
+    base_url = 'profiles/'
+    add_on = 'frame=raw'
+    if portal_vapp:
+        full_url = f"{base_url}?{add_on}"
+        schemas: Dict[str, Dict] = portal_vapp.get(full_url)
+    else:
+        schemas: Dict[str, Dict] = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on)
     filtered_schemas = {}
     for schema_name, schema in schemas.items():
         if allow_abstract or not schema.get('isAbstract'):

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
@@ -192,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp):
     pass
 
 
-class VirtualApp:
+class AbstractVirtualApp:
+    pass
+
+
+class VirtualApp(AbstractVirtualApp):
     """
     Wrapper class for TestApp, to allow custom control over submitting Encoded requests,
     simulating a number of conditions, including permissions.

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
@@ -11,16 +11,16 @@
 import uuid
 import yaml
 
-from dcicutils.common import AnyJsonData
-from dcicutils.env_utils import public_env_name, EnvUtils
-from dcicutils.ff_utils import get_schema
-from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are
-from dcicutils.misc_utils import ignored, PRINT, pad_to, JsonLinesReader
-from dcicutils.task_utils import pmap
 from openpyxl.worksheet.worksheet import Worksheet
 from openpyxl.workbook.workbook import Workbook
 from tempfile import TemporaryFile
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
+from .common import AnyJsonData
+from .env_utils import public_env_name, EnvUtils
+from .ff_utils import get_schema
+from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are
+from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp
+from .task_utils import pmap
 
 
 Header = str
@@ -614,32 +614,34 @@ class SchemaAutoloadMixin(AbstractTableSetManager):
     AUTOLOAD_SCHEMAS_DEFAULT = True
 
     def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
-                 **kwargs):
+                 portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs):
         # This setup must be in place before the class initialization is done (via the super call).
         self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas
-        if self.autoload_schemas:
-            if portal_env is None:
+        if self.autoload_schemas:  # If autoload_schemas is False, we don't care about doing this defaulting.
+            if portal_env is None and portal_vapp is None:
                 portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
                 PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
         self.portal_env: Optional[str] = portal_env
+        self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp
         super().__init__(filename=filename, **kwargs)
 
     def fetch_relevant_schemas(self, schema_names: List[str]):
         # The schema_names argument is not normally given, but it is there for easier testing
         def fetch_schema(schema_name):
-            schema = self.fetch_schema(schema_name, portal_env=self.portal_env)
+            schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
             return schema_name, schema
-        if self.autoload_schemas and self.portal_env:
+        if self.autoload_schemas and (self.portal_env or self.portal_vapp):
             autoloaded = {tab_name: schema
                           for tab_name, schema in pmap(fetch_schema, schema_names)}
             return autoloaded
         else:
             return {}
 
     @classmethod
-    def fetch_schema(cls, schema_name: str, *, portal_env: str):
+    def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None,
+                     portal_vapp: Optional[AbstractVirtualApp] = None):
         def just_fetch_it():
-            return get_schema(schema_name, ff_env=portal_env)
+            return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp)
         if cls.CACHE_SCHEMAS:
             schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name)
             if schema is None:
@@ -665,9 +667,16 @@ def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = No
         self.patch_prototypes_by_tab_name: Dict[str, Dict] = {}
         self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {}
         self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {}
-        self.schemas = schemas or self.fetch_relevant_schemas(self.tab_names)
+        self._schemas = schemas
         self._instaguid_context_table: Dict[str, str] = {}
 
+    @property
+    def schemas(self):
+        schemas = self._schemas
+        if schemas is None:
+            self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names)
+        return schemas
+
     def sheet_patch_prototype(self, tab_name: str) -> Dict:
         return self.patch_prototypes_by_tab_name[tab_name]
 
@@ -841,12 +850,18 @@ class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsMana
 
 
 class InsertsItemMixin(AbstractItemManager):  # ItemManagerMixin isn't really appropriate here
+    """
+    This class is used for inserts directories and other JSON-like data that will be literally used as an Item
+    without semantic pre-processing. In other words, such data will not be pre-checked for semantic correctness
+    but instead assumed to have been checked by other means.
+    """
 
     AUTOLOAD_SCHEMAS_DEFAULT = False  # Has no effect, but someone might inspect the value.
 
     def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None,
-                 schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs):
-        ignored(portal_env)  # Would only be used if autoload_schemas was requested, and we don't allow that.
+                 portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None,
+                 **kwargs):
+        ignored(portal_env, portal_vapp)  # Would only be used if autoload_schemas was true, and we don't allow that.
         if schemas not in [None, {}]:
             raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.")
         if autoload_schemas not in [None, False]:
@@ -1038,12 +1053,24 @@ def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemM
     @classmethod
     def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
              schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None,
+             portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
              **kwargs) -> TabbedSheetData:
         """
-        Given a filename and various options
+        Given a filename and various options, loads the items associated with that filename.
+
+        :param filename: The name of the file to load.
+        :param tab_name: For files that lack multiple tabs (such as .csv or .tsv),
+            the tab name to associate with the data.
+        :param escaping: Whether to perform escape processing on backslashes.
+        :param schemas: A set of schemas to use instead of trying to load them.
+        :param autoload_schemas: Whether to try autoloading schemas.
+        :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal).
+        :param portal_vapp: A vapp to use (usually if calling from within a portal).
         """
         manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
-                                                    schemas=schemas, autoload_schemas=autoload_schemas, **kwargs)
+                                                    schemas=schemas, autoload_schemas=autoload_schemas,
+                                                    portal_env=portal_env, portal_vapp=portal_vapp,
+                                                    **kwargs)
         return manager.load_content()
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "7.9.0.1b5"  # to become "7.10.0"
+version = "7.10.0.1b7"  # to become "7.11.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"