From ada2c8fadad4abd008d1aa0cf32de5f8bf989ef6 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Tue, 7 Nov 2023 16:14:31 -0500
Subject: [PATCH 01/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/bundle_utils.py     | 42 +++++++++++++++++++++++++++++++++--
 dcicutils/validation_utils.py | 18 +++++++++------
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index fb76c74a5..a55b37e3a 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -15,6 +15,7 @@
 
 PatchPrototype = Dict
 TabbedPatchPrototypes = Dict[str, PatchPrototype]
+ARRAY_VALUE_DELIMITER = "|"
 
 
 class TypeHintContext:
@@ -115,6 +116,20 @@ def apply_hint(self, value):
         return super().apply_hint(value)
 
 
+class ArrayHint(TypeHint):
+    def apply_hint(self, value):
+        if value is None or value == '':
+            return []
+        if isinstance(value, str):
+            return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)]
+        return super().apply_hint(value)
+
+
+class StringHint(TypeHint):
+    def apply_hint(self, value):
+        return str(value).strip() if value is not None else ""
+
+
 class RefHint(TypeHint):
 
     def __str__(self):
@@ -288,8 +303,11 @@ def parse_item_value(cls, value: SheetCellValue,
                 return True
             elif lvalue == 'false':
                 return False
-            elif lvalue == 'null' or lvalue == '':
+            # elif lvalue == 'null' or lvalue == '':
+            elif lvalue == 'null':
                 return None
+            elif lvalue == '':
+                return lvalue
             elif split_pipe and '|' in value:
                 if value == '|':  # Use '|' for []
                     return []
@@ -307,7 +325,8 @@ def parse_item_value(cls, value: SheetCellValue,
 
     @classmethod
     def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
-        if (value is None or value == '') and not force:
+        # if (value is None or value == '') and not force:
+        if value is None and not force:
             return
         [key, *more_path] = path
         if not more_path:
@@ -327,10 +346,13 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint
                 link_to = subschema.get('linkTo')
                 if link_to and context.schema_exists(link_to):
                     return RefHint(schema_name=link_to, context=context)
+                return StringHint()
             elif t in ('integer', 'float', 'number'):
                 return NumHint(declared_type=t)
             elif t == 'boolean':
                 return BoolHint()
+            elif t == 'array':
+                return ArrayHint()
 
     @classmethod
     def find_type_hint_for_parsed_header(cls, parsed_header: Optional[ParsedHeader], schema: Any,
@@ -641,6 +663,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
                # TODO: validate= is presently False (i.e., disabled) by default while being debugged,
                #       but for production use maybe should not be? -kmp 25-Oct-2023
                validate: bool = False,
+               retain_empty_properties: bool = False,
                **kwargs):
     annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping,
                                                     prefer_number=False, **kwargs)
@@ -655,8 +678,23 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
         # No fancy checking for things like .json, etc. for now. Only check things that came from
         # spreadsheet-like data, where structural datatypes are forced into strings.
         checked_items = tabbed_rows
+    if not retain_empty_properties:
+        remove_empty_properties(checked_items)
     if validate:
         problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp,
                                                  override_schemas=override_schemas)
         return checked_items, problems
     return checked_items
+
+
+def remove_empty_properties(data: Optional[Union[list,dict]]) -> None:
+    if isinstance(data, dict):
+        for key in list(data.keys()):
+            value = data[key]
+            if not value:
+                del data[key]
+            else:
+                remove_empty_properties(value)
+    elif isinstance(data, list):
+        for item in data:
+            remove_empty_properties(item)
diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py
index 9e80146a0..35014a3cf 100644
--- a/dcicutils/validation_utils.py
+++ b/dcicutils/validation_utils.py
@@ -87,15 +87,19 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam
 
     @classmethod  # This operation doesn't actually use the schemas so is safe as a class method
     def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData:
-        if not identifying_properties:
-            raise ValueError("No identifying properties were specified.")
-        for identifying_property in identifying_properties:
+        for identifying_property in identifying_properties or []:
             if identifying_property in data_item:
                 return data_item[identifying_property]
-        raise ValueError(f'{there_are(identifying_properties, just_are=True)}'
-                         f' no {maybe_pluralize(identifying_properties, "identifying property")}'
-                         f' {disjoined_list([repr(x) for x in identifying_properties])}'
-                         f' in {json.dumps(data_item)}.')
+        if False:
+            if not identifying_properties:
+                raise ValueError("No identifying properties were specified.")
+            for identifying_property in identifying_properties:
+                if identifying_property in data_item:
+                    return data_item[identifying_property]
+            raise ValueError(f'{there_are(identifying_properties, just_are=True)}'
+                             f' no {maybe_pluralize(identifying_properties, "identifying property")}'
+                             f' {disjoined_list([repr(x) for x in identifying_properties])}'
+                             f' in {json.dumps(data_item)}.')
 
     @staticmethod
     def get_identifying_properties(schema: dict) -> list:

From cc03d5201452cdf72eaa8a54c915b1b26a764f90 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Tue, 7 Nov 2023 16:16:18 -0500
Subject: [PATCH 02/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 CHANGELOG.rst  | 6 ++++++
 pyproject.toml | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c50a6d38a..99b3dc103 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,12 @@ dcicutils
 Change Log
 ----------
 
+
+8.3.0
+======
+* More work related to SMaHT ingestion.
+
+
 8.2.0
 =====
 * 2023-11-02
diff --git a/pyproject.toml b/pyproject.toml
index 5c3863c3a..122ce5164 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.2.0"
+version = "8.2.0.1b1"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"

From 7beaa3d34bcf4d1055c3f8e8c4c1f92e80af1e4a Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 8 Nov 2023 13:10:20 -0500
Subject: [PATCH 03/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/sheet_utils.py      | 92 ++++++++++++++++++++++++++++++-----
 dcicutils/validation_utils.py |  1 +
 2 files changed, 82 insertions(+), 11 deletions(-)

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 66ce523fa..8411c3cb2 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -16,8 +16,8 @@
 from tempfile import TemporaryFile, TemporaryDirectory
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
 from .common import AnyJsonData, Regexp, JsonSchema
-from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize  # , there_are
-from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix  # , PRINT, AbstractVirtualApp
+from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize
+from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_snake_case
 
 
 Header = str
@@ -32,6 +32,7 @@
 SheetData = List[dict]
 TabbedSheetData = Dict[str, SheetData]
 TabbedJsonSchemas = Dict[str, JsonSchema]
+CommentRow = object()
 
 
 class LoadFailure(Exception):
@@ -296,12 +297,19 @@ def load_content(self) -> AnyJsonData:
             state = self._create_tab_processor_state(tab_name)
             for row_data in self._raw_row_generator_for_tab_name(tab_name):
                 processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data)
+                if not processed_row_data:
+                    break
+                if processed_row_data is CommentRow:
+                    continue
                 sheet_content.append(processed_row_data)
-            self.content_by_tab_name[tab_name] = sheet_content
+            normalized_tab_name = to_snake_case(tab_name.replace(" ", ""))
+            self.content_by_tab_name[normalized_tab_name] = sheet_content
         return self.content_by_tab_name
 
-    def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
-        return prefer_number(value) if self.prefer_number else value
+    def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData:
+        if override_prefer_number is None:
+            override_prefer_number = self.prefer_number
+        return prefer_number(value) if override_prefer_number else value
 
 
 class TableSetManagerRegistry:
@@ -355,8 +363,9 @@ class XlsxManager(FlattenedTableSetManager):
     """
     This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
     """
-
     ALLOWED_FILE_EXTENSIONS = ['.xlsx']
+    TERMINATE_ON_EMPTY_ROW = True
+    CONVERT_VALUES_TO_STRING = True
 
     @classmethod
     def _all_rows(cls, sheet: Worksheet):
@@ -383,21 +392,66 @@ def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]:
                 for row in self._all_rows(sheet))
 
     def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
-        return [sheet.cell(row=row, column=col).value
-                for col in self._all_cols(sheet)]
+        # return [str(sheet.cell(row=row, column=col).value) for col in self._all_cols(sheet)]
+        results = []
+        for col in self._all_cols(sheet):
+            value = sheet.cell(row=row, column=col).value
+            if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING:
+                value = str(value).strip()
+            results.append(value)
+        return results
 
     def _create_tab_processor_state(self, tab_name: str) -> Headers:
         sheet = self.reader_agent[tab_name]
+        """
         headers: Headers = [str(sheet.cell(row=1, column=col).value)
                             for col in self._all_cols(sheet)]
-        self.headers_by_tab_name[sheet.title] = headers
-        return headers
+        """
+        headers = []
+        for col in self._all_cols(sheet):
+            cell = sheet.cell(row=1, column=col).value
+            if cell is not None and XlsxManager.CONVERT_VALUES_TO_STRING:
+                cell = str(cell).strip()
+            headers.append(cell)
+        self.headers_by_tab_name[sheet.title] = XlsxManager.remove_trailing_none_values(headers)
+        return self.headers_by_tab_name[sheet.title]
 
     def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
         ignored(tab_name)
-        return {headers[i]: self.parse_cell_value(row_datum)
+        if XlsxManager.is_terminating_row(row_data):
+            return None
+        if XlsxManager.is_comment_row(row_data):
+            return CommentRow
+        if len(headers) < len(row_data):
+            row_data = row_data[:len(headers)]
+        override_prefer_number = False if XlsxManager.CONVERT_VALUES_TO_STRING else None
+        return {headers[i]: self.parse_cell_value(row_datum, override_prefer_number=override_prefer_number)
                 for i, row_datum in enumerate(row_data)}
 
+    @staticmethod
+    def is_terminating_row(row: List[Optional[Any]]) -> bool:
+        # TODO: This is may change; currently an all blank row signals the end of input.
+        return all(cell is None for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW
+
+    @staticmethod
+    def is_comment_row(row: Tuple[Optional[Any]]) -> bool:
+        # TODO: This will probably change; currently a row starting only with #, *, or ^ signals a comment.
+        for cell in row:
+            if cell is None:
+                continue
+            if (cell := str(cell)).startswith("#") or cell.startswith("*") or cell.startswith("^"):
+                return True
+        return False
+
+    @staticmethod
+    def remove_trailing_none_values(values: list[Any]) -> list[Any]:
+        for index in range(len(values) - 1, -1, -1):
+            if values[index] is not None:
+                break
+        else:
+            return []
+        return values[:index + 1]
+
 
 def infer_tab_name_from_filename(filename):
     return os.path.basename(filename).split('.')[0]
@@ -688,6 +742,7 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[
 
     @classmethod
     def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+                       retain_empty_properties: bool = False,
                        **kwargs) -> Dict:
         """
         Given a filename and various options
@@ -697,6 +752,8 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping:
             manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
                                                         **kwargs)
             content: TabbedSheetData = manager.load_content()
+            if not retain_empty_properties:
+                remove_empty_properties(content)
             return {
                 'filename': filename,
                 'content': content,
@@ -709,3 +766,16 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping:
 
 
 load_table_set = TableSetManager.load
+load_table_annotated = TableSetManager.load_annotated
+
+def remove_empty_properties(data: Optional[Union[list,dict]]) -> None:
+    if isinstance(data, dict):
+        for key in list(data.keys()):
+            value = data[key]
+            if not value:
+                del data[key]
+            else:
+                remove_empty_properties(value)
+    elif isinstance(data, list):
+        for item in data:
+            remove_empty_properties(item)
diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py
index 35014a3cf..f6612067c 100644
--- a/dcicutils/validation_utils.py
+++ b/dcicutils/validation_utils.py
@@ -63,6 +63,7 @@ def fetch_schema(self, schema_name: str):
             return override_schema
         schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name)
         if schema is None and schema_name not in self.SCHEMA_CACHE:  # If None is already stored, don't look it up again
+            schema_name = schema_name.replace(" ", "")
             schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
             self.SCHEMA_CACHE[schema_name] = schema
         return schema

From 3de7611e2a47b6796737e9dff9ca22253b1cb944 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 8 Nov 2023 13:54:49 -0500
Subject: [PATCH 04/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/sheet_utils.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 8411c3cb2..30fcd88f0 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -302,8 +302,7 @@ def load_content(self) -> AnyJsonData:
                 if processed_row_data is CommentRow:
                     continue
                 sheet_content.append(processed_row_data)
-            normalized_tab_name = to_snake_case(tab_name.replace(" ", ""))
-            self.content_by_tab_name[normalized_tab_name] = sheet_content
+            self.content_by_tab_name[tab_name] = sheet_content
         return self.content_by_tab_name
 
     def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData:
@@ -752,8 +751,6 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping:
             manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
                                                         **kwargs)
             content: TabbedSheetData = manager.load_content()
-            if not retain_empty_properties:
-                remove_empty_properties(content)
             return {
                 'filename': filename,
                 'content': content,
@@ -767,15 +764,3 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping:
 
 load_table_set = TableSetManager.load
 load_table_annotated = TableSetManager.load_annotated
-
-def remove_empty_properties(data: Optional[Union[list,dict]]) -> None:
-    if isinstance(data, dict):
-        for key in list(data.keys()):
-            value = data[key]
-            if not value:
-                del data[key]
-            else:
-                remove_empty_properties(value)
-    elif isinstance(data, list):
-        for item in data:
-            remove_empty_properties(item)

From 33b64aece187c8d7361a7f229f024fbaaf84d7a9 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 8 Nov 2023 22:49:01 -0500
Subject: [PATCH 05/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/bundle_utils.py          |  5 ++--
 dcicutils/s3_utils.py              |  2 +-
 dcicutils/sheet_utils.py           |  2 +-
 test/data_files/sample_items2.json |  2 +-
 test/test_bundle_utils.py          | 38 +++++++++++++++++++-----
 test/test_qa_utils.py              |  2 +-
 test/test_sheet_utils.py           | 47 ++++++++++++++++++++++++++----
 7 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index a55b37e3a..04c6607f0 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -687,11 +687,10 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
     return checked_items
 
 
-def remove_empty_properties(data: Optional[Union[list,dict]]) -> None:
+def remove_empty_properties(data: Optional[Union[list, dict]]) -> None:
     if isinstance(data, dict):
         for key in list(data.keys()):
-            value = data[key]
-            if not value:
+            if (value := data[key]) is None:
                 del data[key]
             else:
                 remove_empty_properties(value)
diff --git a/dcicutils/s3_utils.py b/dcicutils/s3_utils.py
index e34a3f2b7..ea1897beb 100644
--- a/dcicutils/s3_utils.py
+++ b/dcicutils/s3_utils.py
@@ -318,7 +318,7 @@ def get_key(self, keyfile_name: str = 'access_key_admin'
                                       SSECustomerKey=os.environ['S3_ENCRYPT_KEY'],
                                       SSECustomerAlgorithm='AES256')
         body_data: Union[bytes, str] = response['Body'].read()
-        auth_text: str = body_data.decode() if type(body_data) == bytes else body_data
+        auth_text: str = body_data.decode() if type(body_data) is bytes else body_data
         try:
             return json.loads(auth_text)
         except (ValueError, TypeError):
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 30fcd88f0..370c22bd8 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -17,7 +17,7 @@
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
 from .common import AnyJsonData, Regexp, JsonSchema
 from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize
-from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_snake_case
+from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix
 
 
 Header = str
diff --git a/test/data_files/sample_items2.json b/test/data_files/sample_items2.json
index 7e084f908..0f3b22f90 100644
--- a/test/data_files/sample_items2.json
+++ b/test/data_files/sample_items2.json
@@ -1,6 +1,6 @@
 [
     {"name": "john", "sex": "Male", "member": false},
     {"name": "juan", "sex": "Male", "member": true},
-    {"name": "igor", "sex": "unknown", "member": null},
+    {"name": "igor", "sex": "unknown", "member": ""},
     {"name": "mary", "sex": "Female", "member": true}
 ]
diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py
index 6e1da6685..c739c8504 100644
--- a/test/test_bundle_utils.py
+++ b/test/test_bundle_utils.py
@@ -4,6 +4,7 @@
 import os
 import pytest
 import re
+from typing import List, Optional, Union
 
 from dcicutils import (
     bundle_utils as bundle_utils_module,
@@ -172,7 +173,7 @@ def test_item_tools_parse_item_value_basic():
 
         # Nulls
         (None, None),
-        ('', None), ('null', None), ('Null', None), ('NULL', None),
+        ('', ''), ('null', None), ('Null', None), ('NULL', None),
 
         # Booleans
         ('true', True), ('True', True), ('TRUE', True),
@@ -191,9 +192,9 @@ def test_item_tools_parse_item_value_basic():
         ('|', []),  # special case: lone '|' means empty
         ('alpha|', ['alpha']), ('7|', [7]),  # special case: trailing '|' means singleton
         # These follow from general case of '|' as separator of items recursively parsed
-        ('|alpha', [None, 'alpha']), ('|alpha|', [None, 'alpha']), ('|7', [None, 7]),
+        ('|alpha', ['', 'alpha']), ('|alpha|', ['', 'alpha']), ('|7', ['', 7]),
         ('alpha|beta|gamma', ['alpha', 'beta', 'gamma']),
-        ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, None, 7, 1.5])
+        ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, '', 7, 1.5])
     ]
 
     for input, heuristic_result in expectations:
@@ -271,6 +272,7 @@ def test_load_table_structures():
     print("loaded=", json.dumps(loaded, indent=2))
     expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT
     print("expected=", json.dumps(expected, indent=2))
+    # assert loaded == expected
     assert loaded == expected
 
     with pytest.raises(LoadArgumentsError) as exc:
@@ -292,9 +294,13 @@ def test_load_items():
     #     mock_get_schema.return_value = {}
     with no_schemas():
 
-        assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT
-        assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT
-        assert load_items(SAMPLE_TSV_FILE, apply_heuristics=True) == SAMPLE_TSV_FILE_ITEM_CONTENT
+        # TODO: Some work still to do WRT sorting out between None and empty strings in general.
+        assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == \
+            remove_empty_properties(SAMPLE_XLSX_FILE_ITEM_CONTENT)
+        assert remove_empty_properties(load_items(SAMPLE_CSV_FILE, apply_heuristics=True), [None, '']) == \
+            SAMPLE_CSV_FILE_ITEM_CONTENT
+        assert remove_empty_properties(load_items(SAMPLE_TSV_FILE, apply_heuristics=True), [None, '']) == \
+            SAMPLE_TSV_FILE_ITEM_CONTENT
 
         with pytest.raises(LoadArgumentsError) as exc:
             load_items("something.else")
@@ -329,7 +335,7 @@ def test_load_items():
     SAMPLE_CSV_FILE2_SHEET_NAME: [
         {"name": "john", "sex": "M", "member": False},
         {"name": "juan", "sex": "male", "member": True},
-        {"name": "igor", "sex": "unknown", "member": None},
+        {"name": "igor", "sex": "unknown", "member": ""},
         {"name": "mary", "sex": "Female", "member": "t"}
     ]
 }
@@ -338,7 +344,7 @@ def test_load_items():
     "Person": [
         {"name": "john", "sex": "Male", "member": False},
         {"name": "juan", "sex": "Male", "member": True},
-        {"name": "igor", "sex": "unknown", "member": None},
+        {"name": "igor", "sex": "unknown", "member": ""},
         {"name": "mary", "sex": "Female", "member": True}
     ]
 }
@@ -733,3 +739,19 @@ def test_table_checker():
                                flattened=True,
                                portal_env=mock_ff_env)
         checker.check_tabs()
+
+
+def remove_empty_properties(data: Optional[Union[list, dict]],
+                            empty: Optional[List[Optional[str]]] = None) -> Optional[Union[list, dict]]:
+    if not empty:
+        empty = [None]
+    if isinstance(data, dict):
+        for key in list(data.keys()):
+            if (value := data[key]) in empty:
+                del data[key]
+            else:
+                remove_empty_properties(value, empty)
+    elif isinstance(data, list):
+        for item in data:
+            remove_empty_properties(item, empty)
+    return data
diff --git a/test/test_qa_utils.py b/test/test_qa_utils.py
index bed502166..6e9c606bf 100644
--- a/test/test_qa_utils.py
+++ b/test/test_qa_utils.py
@@ -568,7 +568,7 @@ def test_occasionally_errors():
                                         error_message="something")
         fail_for_a_while(1)
     except Exception as e:
-        assert type(e) == SyntaxError
+        assert type(e) is SyntaxError
         assert str(e) == "something"
 
 
diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py
index c012772c1..6cb2405e3 100644
--- a/test/test_sheet_utils.py
+++ b/test/test_sheet_utils.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+from typing import Optional, Union
 
 from collections import namedtuple
 from dcicutils.sheet_utils import (
@@ -116,7 +117,7 @@ def test_table_set_manager_registry_manager_for_filename():
 
 SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx')
 
-SAMPLE_XLSX_FILE_RAW_CONTENT = {
+old_SAMPLE_XLSX_FILE_RAW_CONTENT = {
     "Sheet1": [
         {"x": 1, "y.a": 1, "y.z": 1},
         {"x": 1, "y.a": 2, "y.z": 3},
@@ -139,6 +140,29 @@ def test_table_set_manager_registry_manager_for_filename():
         },
     ]
 }
+SAMPLE_XLSX_FILE_RAW_CONTENT = {
+    "Sheet1": [
+        {"x": "1", "y.a": "1", "y.z": "1"},
+        {"x": "1", "y.a": "2", "y.z": "3"},
+        {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"},
+    ],
+    "Sheet2": [
+        {
+            "name": "bill", "age": "23",
+            "mother.name": "mary", "mother.age": "58",
+            "father.name": "fred", "father.age": "63",
+            "friends#0.name": "sam", "friends#0.age": "22",
+            "friends#1.name": "arthur", "friends#1.age": "19",
+        },
+        {
+            "name": "joe", "age": "9",
+            "mother.name": "estrella", "mother.age": "35",
+            "father.name": "anthony", "father.age": "34",
+            "friends#0.name": "anders", "friends#0.age": "9",
+            "friends#1.name": None, "friends#1.age": None,
+        },
+    ]
+}
 
 SAMPLE_XLSX_FILE_INFLATED_CONTENT = {
     "Sheet1": [
@@ -243,11 +267,11 @@ def test_xlsx_manager_load_csv():
 
 def test_csv_manager_load_content():
     wt = CsvManager(SAMPLE_CSV_FILE)
-    assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT
+    assert convert_number_to_string(wt.load_content()) == SAMPLE_CSV_FILE_RAW_CONTENT
 
 
 def test_csv_manager_load():
-    assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT
+    assert convert_number_to_string(CsvManager.load(SAMPLE_CSV_FILE)) == SAMPLE_CSV_FILE_RAW_CONTENT
 
 
 def test_csv_manager_load_csv():
@@ -269,11 +293,11 @@ def test_csv_escaping():
 
 def test_tsv_manager_load_content():
     wt = TsvManager(SAMPLE_TSV_FILE)
-    assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT
+    assert convert_number_to_string(wt.load_content()) == SAMPLE_TSV_FILE_RAW_CONTENT
 
 
 def test_tsv_manager_load():
-    assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT
+    assert convert_number_to_string(TsvManager.load(SAMPLE_TSV_FILE)) == SAMPLE_TSV_FILE_RAW_CONTENT
 
 
 def test_tsv_manager_load_csv():
@@ -281,3 +305,16 @@ def test_tsv_manager_load_csv():
         TsvManager.load(SAMPLE_XLSX_FILE)
     assert str(exc.value).startswith('The TableSetManager subclass TsvManager'
                                      ' expects only .tsv or .tsv.txt filenames:')
+
+
+def convert_number_to_string(data: Optional[Union[list, dict]]) -> Optional[Union[list, dict]]:
+    if isinstance(data, dict):
+        for key in list(data.keys()):
+            if isinstance(value := data[key], int):
+                data[key] = str(value)
+            else:
+                convert_number_to_string(value)
+    elif isinstance(data, list):
+        for item in data:
+            convert_number_to_string(item)
+    return data

From 701384f05e4b6f4809157b7804aada5f15b7fa4d Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 8 Nov 2023 22:54:34 -0500
Subject: [PATCH 06/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/validation_utils.py | 12 +++++-------
 test/test_validation_utils.py |  8 ++++----
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py
index f6612067c..56e473501 100644
--- a/dcicutils/validation_utils.py
+++ b/dcicutils/validation_utils.py
@@ -87,16 +87,14 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam
         return identifying_properties
 
     @classmethod  # This operation doesn't actually use the schemas so is safe as a class method
-    def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData:
+    def identifying_value(cls, data_item: Dict[str, AnyJsonData],
+                          identifying_properties, raise_exception: bool = False) -> AnyJsonData:
+        if not identifying_properties and raise_exception:
+            raise ValueError("No identifying properties were specified.")
         for identifying_property in identifying_properties or []:
             if identifying_property in data_item:
                 return data_item[identifying_property]
-        if False:
-            if not identifying_properties:
-                raise ValueError("No identifying properties were specified.")
-            for identifying_property in identifying_properties:
-                if identifying_property in data_item:
-                    return data_item[identifying_property]
+        if raise_exception:
             raise ValueError(f'{there_are(identifying_properties, just_are=True)}'
                              f' no {maybe_pluralize(identifying_properties, "identifying property")}'
                              f' {disjoined_list([repr(x) for x in identifying_properties])}'
diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py
index 40878a222..9129e05c0 100644
--- a/test/test_validation_utils.py
+++ b/test/test_validation_utils.py
@@ -73,20 +73,20 @@ def test_schema_manager_with_schemas(schema_id):
 def test_schema_manager_identifying_value():
 
     with pytest.raises(ValueError) as exc:
-        assert SchemaManager.identifying_value({'any': 'thing'}, [])
+        assert SchemaManager.identifying_value({'any': 'thing'}, [], raise_exception=True)
     assert str(exc.value) == "No identifying properties were specified."
 
     person_named_fred = {'age': 33, 'name': 'Fred', 'favorite-color': 'yellow'}
-    assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name']) == 'Fred'
+    assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name'], raise_exception=True) == 'Fred'
 
     person_nicknamed_fred = {'age': 33, 'nickname': 'Fred', 'favorite-color': 'yellow'}
     with pytest.raises(ValueError) as exc:
-        SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name'])
+        SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name'], raise_exception=True)
     assert str(exc.value) == ("""There are no identifying properties 'uuid' or 'name'"""
                               """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""")
 
     with pytest.raises(ValueError) as exc:
-        SchemaManager.identifying_value(person_nicknamed_fred, ['name'])
+        SchemaManager.identifying_value(person_nicknamed_fred, ['name'], raise_exception=True)
     assert str(exc.value) == ("""There is no identifying property 'name'"""
                               """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""")
 

From d49fd69f27e02a1f089e2cc96babc788156c9b3b Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 8 Nov 2023 22:57:45 -0500
Subject: [PATCH 07/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/sheet_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 370c22bd8..54ca37fac 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -443,7 +443,7 @@ def is_comment_row(row: Tuple[Optional[Any]]) -> bool:
         return False
 
     @staticmethod
-    def remove_trailing_none_values(values: list[Any]) -> list[Any]:
+    def remove_trailing_none_values(values: List[Any]) -> list[Any]:
         for index in range(len(values) - 1, -1, -1):
             if values[index] is not None:
                 break

From 4e21b859f6d13fe143612cddd139cfa0054f090b Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 8 Nov 2023 22:59:23 -0500
Subject: [PATCH 08/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/sheet_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 54ca37fac..771f06bcd 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -443,7 +443,7 @@ def is_comment_row(row: Tuple[Optional[Any]]) -> bool:
         return False
 
     @staticmethod
-    def remove_trailing_none_values(values: List[Any]) -> list[Any]:
+    def remove_trailing_none_values(values: List[Any]) -> List[Any]:
         for index in range(len(values) - 1, -1, -1):
             if values[index] is not None:
                 break

From 4e55f2914cdd85bdf9ecf423c4b25cfd92620173 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Thu, 9 Nov 2023 16:33:21 -0500
Subject: [PATCH 09/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/bundle_utils.py           | 39 ++++++++++++++++++-----------
 dcicutils/sheet_utils.py            | 20 ++++++++-------
 dcicutils/validation_utils.py       |  8 +++---
 test/data_files/escaping-false.json | 16 ++++++------
 test/data_files/escaping-true.json  | 16 ++++++------
 test/test_bundle_utils.py           |  7 +++---
 test/test_sheet_utils.py            | 28 ++++++++++-----------
 7 files changed, 75 insertions(+), 59 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 04c6607f0..6e0dc4553 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -74,11 +74,11 @@ def apply_hint(self, value):
 
 class NumHint(TypeHint):
 
-    PREFERENCE_MAP = {'number': 'num', 'integer': 'int', 'float': 'float'}
+    PREFERENCE_MAP = {'number': 'number', 'integer': 'integer'}
 
     def __init__(self, declared_type: Optional[str] = None):
         if declared_type is None:
-            declared_type = 'num'
+            declared_type = 'number'
         self.preferred_type = self.PREFERENCE_MAP.get(declared_type)
 
     def apply_hint(self, value):
@@ -347,7 +347,7 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint
                 if link_to and context.schema_exists(link_to):
                     return RefHint(schema_name=link_to, context=context)
                 return StringHint()
-            elif t in ('integer', 'float', 'number'):
+            elif t in ('integer', 'number'):
                 return NumHint(declared_type=t)
             elif t == 'boolean':
                 return BoolHint()
@@ -452,7 +452,8 @@ class TableChecker(InflatableTabbedDataManager, TypeHintContext):
     def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
                  override_schemas: Optional[TabbedJsonSchemas] = None,
                  portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
-                 apply_heuristics: bool = False):
+                 apply_heuristics: bool = False,
+                 noschemas: bool = False):
 
         self.flattened = flattened
         # if not flattened:
@@ -462,7 +463,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
         #     # -kmp 25-Oct-2023
         #     raise ValueError("Only flattened=True is supported by TableChecker for now.")
 
-        if portal_env is None and portal_vapp is None:
+        if portal_env is None and portal_vapp is None and not noschemas:
             portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
         # InflatableTabbedDataManager supplies:
         #   self.tabbed_sheet_data: TabbedSheetData =
@@ -473,9 +474,14 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
         super().__init__(tabbed_sheet_data=tabbed_sheet_data, apply_heuristics=apply_heuristics)
         self.portal_env = portal_env
         self.portal_vapp = portal_vapp
-        self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
-                                                           override_schemas=override_schemas)
-        self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names)  # , schemas=schemas)
+        if not noschemas:
+            self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
+                                                               override_schemas=override_schemas, noschemas=noschemas)
+            schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value]
+            self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None # , schemas=schemas)
+        else:
+            self.schema_manager = None
+            self.schemas = {}
         self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = {
             tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows)
             for tab_name, rows in tabbed_sheet_data.items()
@@ -496,6 +502,8 @@ def note_problem(self, problem: str):
         self._problems.append(problem)
 
     def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]:
+        if not self.schema_manager or not rows:
+            return []
         identifying_properties = self.schema_manager.identifying_properties(schema_name=tab_name)
         if not identifying_properties:
             # Maybe issue a warning here that we're going to lose
@@ -554,7 +562,7 @@ def validate_ref(self, item_type, item_ref):
             return False
 
     def schema_exists(self, schema_name: str) -> bool:
-        return self.schema_manager.schema_exists(schema_name)
+        return self.schema_manager.schema_exists(schema_name) if self.schema_manager else False
 
     def check_tab(self, tab_name: str):
         prototype = self.patch_prototypes_by_tab_name[tab_name]
@@ -617,10 +625,11 @@ def check(cls, tabbed_sheet_data: TabbedSheetData, *,
               flattened: bool,
               override_schemas: Optional[TabbedJsonSchemas] = None,
               apply_heuristics: bool = False,
+              noschemas: bool = False,
               portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
         checker = cls(tabbed_sheet_data, flattened=flattened,
                       override_schemas=override_schemas, apply_heuristics=apply_heuristics,
-                      portal_env=portal_env, portal_vapp=portal_vapp)
+                      portal_env=portal_env, portal_vapp=portal_vapp, noschemas=noschemas)
         checked = checker.check_tabs()
         return checked
 
@@ -654,7 +663,7 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState:
                                type_hints=self.type_hints_by_tab_name[tab_name])
 
 
-check = TableChecker.check
+# check = TableChecker.check
 
 
 def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
@@ -664,6 +673,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
                #       but for production use maybe should not be? -kmp 25-Oct-2023
                validate: bool = False,
                retain_empty_properties: bool = False,
+               noschemas: bool = False,
                **kwargs):
     annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping,
                                                     prefer_number=False, **kwargs)
@@ -673,14 +683,15 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
         checked_items = TableChecker.check(tabbed_rows, flattened=flattened,
                                            override_schemas=override_schemas,
                                            portal_env=portal_env, portal_vapp=portal_vapp,
-                                           apply_heuristics=apply_heuristics)
+                                           apply_heuristics=apply_heuristics,
+                                           noschemas=noschemas)
     else:
         # No fancy checking for things like .json, etc. for now. Only check things that came from
         # spreadsheet-like data, where structural datatypes are forced into strings.
         checked_items = tabbed_rows
     if not retain_empty_properties:
         remove_empty_properties(checked_items)
-    if validate:
+    if validate and not noschemas:
         problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp,
                                                  override_schemas=override_schemas)
         return checked_items, problems
@@ -690,7 +701,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
 def remove_empty_properties(data: Optional[Union[list, dict]]) -> None:
     if isinstance(data, dict):
         for key in list(data.keys()):
-            if (value := data[key]) is None:
+            if (value := data[key]) in [None, ""]:
                 del data[key]
             else:
                 remove_empty_properties(value)
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 771f06bcd..b3f191f15 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -70,7 +70,7 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False):
                                      f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.")
 
 
-def prefer_number(value: SheetCellValue, kind='num'):
+def prefer_number(value: SheetCellValue, kind='number'):
     """
     Given a string, if the string has number syntax, returns the number it represents. Otherwise, returns its argument.
     (It follows from this that if given an int or a float, it just returns that argument.)
@@ -79,7 +79,7 @@ def prefer_number(value: SheetCellValue, kind='num'):
     is coerced to, but it has no effect when the argument is a number, even a number of the wrong kind.
 
     :param value: a string, int, or float
-    :param kind: one of 'num', 'int', or 'float'
+    :param kind: one of 'number' or 'integer'
     :returns: the argument coerced to a number of the appropriate kind, if possible, or else the argument literally
     """
     if isinstance(value, str):  # the given value might be an int or float, in which case just fall through
@@ -88,12 +88,12 @@ def prefer_number(value: SheetCellValue, kind='num'):
         value = value
         ch0 = value[0]
         if ch0 == '+' or ch0 == '-' or ch0.isdigit():
-            if kind == 'num' or kind == 'int':
+            if kind == 'integer':
                 try:
                     return int(value)
                 except Exception:
                     pass
-            if kind == 'num' or kind == 'float':
+            if kind == 'number':
                 try:
                     return float(value)
                 except Exception:
@@ -302,7 +302,7 @@ def load_content(self) -> AnyJsonData:
                 if processed_row_data is CommentRow:
                     continue
                 sheet_content.append(processed_row_data)
-            self.content_by_tab_name[tab_name] = sheet_content
+            self.content_by_tab_name[tab_name.replace(" ", "")] = sheet_content
         return self.content_by_tab_name
 
     def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData:
@@ -395,8 +395,10 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow:
         results = []
         for col in self._all_cols(sheet):
             value = sheet.cell(row=row, column=col).value
-            if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING:
-                value = str(value).strip()
+#           if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING:
+#               value = str(value).strip()
+            if XlsxManager.CONVERT_VALUES_TO_STRING:
+                value = str(value).strip() if value is not None else ""
             results.append(value)
         return results
 
@@ -642,10 +644,10 @@ def _escape_cell_text(cls, cell_text):
     def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
         ignored(tab_name)
         if self.escaping:
-            return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text))
+            return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text), override_prefer_number=False)
                     for i, cell_text in enumerate(row_data)}
         else:
-            return {headers[i]: self.parse_cell_value(cell_text)
+            return {headers[i]: self.parse_cell_value(cell_text, override_prefer_number=False)
                     for i, cell_text in enumerate(row_data)}
 
 
diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py
index 56e473501..e3a0e050c 100644
--- a/dcicutils/validation_utils.py
+++ b/dcicutils/validation_utils.py
@@ -27,10 +27,10 @@ def fresh_schema_manager_context_for_testing(cls):
         # finally:
         #     cls.SCHEMA_CACHE = old_schema_cache
 
-    def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None,
+    def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, noschemas: bool = False,
                  portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
         self.SCHEMA_CACHE = {}  # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
-        if portal_env is None and portal_vapp is None:
+        if portal_env is None and portal_vapp is None and not noschemas:
             portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
             PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
         self.portal_env = portal_env
@@ -174,7 +174,9 @@ def validate_data_against_schemas(data: TabbedSheetData, *,
     schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas)
 
     errors = []
-    schemas = schema_manager.fetch_relevant_schemas(list(data.keys()))
+
+    schema_names_to_fetch = [key for key, value in data.items() if value]
+    schemas = schema_manager.fetch_relevant_schemas(schema_names_to_fetch)
 
     for data_type in data:
         schema = schemas.get(data_type)
diff --git a/test/data_files/escaping-false.json b/test/data_files/escaping-false.json
index 84ab06993..3c95926d9 100644
--- a/test/data_files/escaping-false.json
+++ b/test/data_files/escaping-false.json
@@ -5,56 +5,56 @@
       "unquoted": "\\\\",
       "doublequoted": "\\\\",
       "singlequoted": "'\\\\'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "formfeed",
       "unquoted": "\\f",
       "doublequoted": "\\f",
       "singlequoted": "'\\f'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "newline",
       "unquoted": "\\n",
       "doublequoted": "\\n",
       "singlequoted": "'\\n'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "return",
       "unquoted": "\\r",
       "doublequoted": "\\r",
       "singlequoted": "'\\r'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "tab",
       "unquoted": "\\t",
       "doublequoted": "\\t",
       "singlequoted": "'\\t'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "misc",
       "unquoted": "\\m",
       "doublequoted": "\\m",
       "singlequoted": "'\\m'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "quote1",
       "unquoted": "N/A",
       "doublequoted": "x,,z",
       "singlequoted": "N/A",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "quotelong",
       "unquoted": "N/A",
       "doublequoted": "x,,z,N/A\nquotlongcontinued,",
       "singlequoted": "N/A",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "comma",
diff --git a/test/data_files/escaping-true.json b/test/data_files/escaping-true.json
index 5d6c837a6..bae122afd 100644
--- a/test/data_files/escaping-true.json
+++ b/test/data_files/escaping-true.json
@@ -5,56 +5,56 @@
       "unquoted": "\\",
       "doublequoted": "\\",
       "singlequoted": "'\\'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "formfeed",
       "unquoted": "\\f",
       "doublequoted": "\\f",
       "singlequoted": "'\\f'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "newline",
       "unquoted": "\n",
       "doublequoted": "\n",
       "singlequoted": "'\n'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "return",
       "unquoted": "\r",
       "doublequoted": "\r",
       "singlequoted": "'\r'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "tab",
       "unquoted": "\t",
       "doublequoted": "\t",
       "singlequoted": "'\t'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "misc",
       "unquoted": "\\m",
       "doublequoted": "\\m",
       "singlequoted": "'\\m'",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "quote1",
       "unquoted": "N/A",
       "doublequoted": "x,,z",
       "singlequoted": "N/A",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "quotelong",
       "unquoted": "N/A",
       "doublequoted": "x,,z,N/A\nquotlongcontinued,",
       "singlequoted": "N/A",
-      "overflow": null
+      "overflow": ""
     },
     {
       "name": "comma",
diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py
index c739c8504..89de7b8a0 100644
--- a/test/test_bundle_utils.py
+++ b/test/test_bundle_utils.py
@@ -326,7 +326,7 @@ def test_load_items():
     SAMPLE_CSV_FILE2_SHEET_NAME: [
         {"name": "john", "sex": "M", "member": "false"},
         {"name": "juan", "sex": "male", "member": "true"},
-        {"name": "igor", "sex": "unknown", "member": None},
+        {"name": "igor", "sex": "unknown", "member": ""},
         {"name": "mary", "sex": "Female", "member": "t"}
     ]
 }
@@ -335,7 +335,7 @@ def test_load_items():
     SAMPLE_CSV_FILE2_SHEET_NAME: [
         {"name": "john", "sex": "M", "member": False},
         {"name": "juan", "sex": "male", "member": True},
-        {"name": "igor", "sex": "unknown", "member": ""},
+        {"name": "igor", "sex": "unknown"},
         {"name": "mary", "sex": "Female", "member": "t"}
     ]
 }
@@ -344,7 +344,7 @@ def test_load_items():
     "Person": [
         {"name": "john", "sex": "Male", "member": False},
         {"name": "juan", "sex": "Male", "member": True},
-        {"name": "igor", "sex": "unknown", "member": ""},
+        {"name": "igor", "sex": "unknown"},
         {"name": "mary", "sex": "Female", "member": True}
     ]
 }
@@ -466,6 +466,7 @@ def test_load_items_with_schema():
     print("Case 3")
     expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED
     actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person')
+    import pdb ; pdb.set_trace()
     assert actual == expected
 
 
diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py
index 6cb2405e3..276b17b23 100644
--- a/test/test_sheet_utils.py
+++ b/test/test_sheet_utils.py
@@ -117,25 +117,25 @@ def test_table_set_manager_registry_manager_for_filename():
 
 SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx')
 
-old_SAMPLE_XLSX_FILE_RAW_CONTENT = {
+backup_SAMPLE_XLSX_FILE_RAW_CONTENT = {
     "Sheet1": [
-        {"x": 1, "y.a": 1, "y.z": 1},
-        {"x": 1, "y.a": 2, "y.z": 3},
+        {"x": "1", "y.a": "1", "y.z": "1"},
+        {"x": "1", "y.a": "2", "y.z": "3"},
         {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"},
     ],
     "Sheet2": [
         {
-            "name": "bill", "age": 23,
-            "mother.name": "mary", "mother.age": 58,
-            "father.name": "fred", "father.age": 63,
-            "friends#0.name": "sam", "friends#0.age": 22,
-            "friends#1.name": "arthur", "friends#1.age": 19,
+            "name": "bill", "age": "23",
+            "mother.name": "mary", "mother.age": "58",
+            "father.name": "fred", "father.age": "63",
+            "friends#0.name": "sam", "friends#0.age": "22",
+            "friends#1.name": "arthur", "friends#1.age": "19",
         },
         {
-            "name": "joe", "age": 9,
-            "mother.name": "estrella", "mother.age": 35,
-            "father.name": "anthony", "father.age": 34,
-            "friends#0.name": "anders", "friends#0.age": 9,
+            "name": "joe", "age": "9",
+            "mother.name": "estrella", "mother.age": "35",
+            "father.name": "anthony", "father.age": "34",
+            "friends#0.name": "anders", "friends#0.age": "9",
             "friends#1.name": None, "friends#1.age": None,
         },
     ]
@@ -159,7 +159,7 @@ def test_table_set_manager_registry_manager_for_filename():
             "mother.name": "estrella", "mother.age": "35",
             "father.name": "anthony", "father.age": "34",
             "friends#0.name": "anders", "friends#0.age": "9",
-            "friends#1.name": None, "friends#1.age": None,
+            "friends#1.name": "", "friends#1.age": "",
         },
     ]
 }
@@ -186,7 +186,7 @@ def test_table_set_manager_registry_manager_for_filename():
             "father": {"name": "anthony", "age": 34},
             "friends": [
                 {"name": "anders", "age": 9},
-                {"name": None, "age": None}
+                {"name": "", "age": ""}
             ]
         },
     ],

From 23d29e82f742cf89cec741af51af15d272189738 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 11 Nov 2023 14:59:02 -0500
Subject: [PATCH 10/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/sheet_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index b3f191f15..c0d7d0a80 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -432,7 +432,7 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A
     @staticmethod
     def is_terminating_row(row: List[Optional[Any]]) -> bool:
         # TODO: This is may change; currently an all blank row signals the end of input.
-        return all(cell is None for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW
+        return all(cell is None or cell == "" for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW
 
     @staticmethod
     def is_comment_row(row: Tuple[Optional[Any]]) -> bool:

From d80817e1b5c6f8d2387ee4da1ff66cb5cfa1cc88 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 11 Nov 2023 23:09:13 -0500
Subject: [PATCH 11/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 dcicutils/bundle_utils.py | 51 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 6e0dc4553..92953b5b3 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -1,3 +1,4 @@
+import json
 import copy
 
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -45,6 +46,9 @@ def __init__(self, problems: Optional[dict] = None):
 
 
 class TypeHint:
+    def __init__(self):
+        self.is_array = False
+
     def apply_hint(self, value):
         return value
 
@@ -56,6 +60,8 @@ def __repr__(self):
 
 
 class BoolHint(TypeHint):
+    def __init__(self):
+        super().__init__()
 
     # We could use other ways to do this, such as initial substring, but this is more likely to be right.
     # Then again, we might want to consder accepting athers like 'yes/no', 'y/n', 'on/off', '1/0'.
@@ -80,6 +86,7 @@ def __init__(self, declared_type: Optional[str] = None):
         if declared_type is None:
             declared_type = 'number'
         self.preferred_type = self.PREFERENCE_MAP.get(declared_type)
+        super().__init__()
 
     def apply_hint(self, value):
         if isinstance(value, str) and value:
@@ -97,6 +104,7 @@ def __str__(self):
 
     def __init__(self, value_map):
         self.value_map = value_map
+        super().__init__()
 
     def apply_hint(self, value):
         if isinstance(value, str):
@@ -117,6 +125,9 @@ def apply_hint(self, value):
 
 
 class ArrayHint(TypeHint):
+    def __init__(self):
+        super().__init__()
+
     def apply_hint(self, value):
         if value is None or value == '':
             return []
@@ -126,6 +137,9 @@ def apply_hint(self, value):
 
 
 class StringHint(TypeHint):
+    def __init__(self):
+        super().__init__()
+
     def apply_hint(self, value):
         return str(value).strip() if value is not None else ""
 
@@ -138,9 +152,16 @@ def __str__(self):
     def __init__(self, schema_name: str, context: TypeHintContext):
         self.schema_name = schema_name
         self.context = context
+        super().__init__()
 
     def apply_hint(self, value):
-        if not self.context.validate_ref(item_type=self.schema_name, item_ref=value):
+        if self.is_array:
+            value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)]
+            for item in value:
+                if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
+                    raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}")
+            return value
+        if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value):
             raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}")
         return value
 
@@ -352,6 +373,10 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint
             elif t == 'boolean':
                 return BoolHint()
             elif t == 'array':
+                array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), context)
+                if type(array_type_hint) == RefHint:
+                    array_type_hint.is_array = True
+                    return array_type_hint
                 return ArrayHint()
 
     @classmethod
@@ -527,7 +552,15 @@ def contains_ref(self, item_type, item_ref):
     def resolve_ref(self, item_type, item_ref):
         lookup_table = self.lookup_tables_by_tab_name.get(item_type)
         if lookup_table:  # Is it a type we're tracking?
-            return lookup_table.get(item_ref) or None
+            if isinstance(item_ref, list):
+                found = True
+                for item in item_ref:
+                    if not lookup_table.get(item):
+                        found = False
+                        break
+                return True if found else None
+            else:
+                return lookup_table.get(item_ref) or None
         else:  # Apparently some stray type not in our tables
             return None
 
@@ -553,8 +586,18 @@ def validate_ref(self, item_type, item_ref):
             return True
         try:
             # TODO: This probably needs a cache
-            info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}",
-                                ff_env=self.portal_env, vapp=self.portal_vapp)
+            if isinstance(item_ref, list):
+                found = True
+                for item in item_ref:
+                    info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}",
+                                        ff_env=self.portal_env, vapp=self.portal_vapp)
+                    if not isinstance(info, dict) or 'uuid' not in info:
+                        found = False
+                        break
+                return found
+            else:
+                info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}",
+                                    ff_env=self.portal_env, vapp=self.portal_vapp)
             # Basically return True if there's a value at all,
             # but still check it's not an error message that didn't get raised.
             return isinstance(info, dict) and 'uuid' in info

From 7260521cc3abfd6d4f7ee9d73b81758f4a87e924 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 11 Nov 2023 23:10:13 -0500
Subject: [PATCH 12/49] Changes to bundle_utils/etc related to SMaHT ingestion
 work.

---
 test/test_bundle_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py
index 89de7b8a0..3edd4acc0 100644
--- a/test/test_bundle_utils.py
+++ b/test/test_bundle_utils.py
@@ -466,7 +466,6 @@ def test_load_items_with_schema():
     print("Case 3")
     expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED
     actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person')
-    import pdb ; pdb.set_trace()
     assert actual == expected
 
 

From 676b7ceb57343052fd8166f619144adcc4ecae75 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 13 Nov 2023 14:52:38 -0500
Subject: [PATCH 13/49] Minor sheet_utils updates related to SMaHT ingestion.

---
 dcicutils/bundle_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 92953b5b3..82e115e65 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -132,6 +132,8 @@ def apply_hint(self, value):
         if value is None or value == '':
             return []
         if isinstance(value, str):
+            if not value:
+                return []
             return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)]
         return super().apply_hint(value)
 
@@ -156,7 +158,7 @@ def __init__(self, schema_name: str, context: TypeHintContext):
 
     def apply_hint(self, value):
         if self.is_array:
-            value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)]
+            value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else []
             for item in value:
                 if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
                     raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}")
@@ -589,7 +591,7 @@ def validate_ref(self, item_type, item_ref):
             if isinstance(item_ref, list):
                 found = True
                 for item in item_ref:
-                    info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}",
+                    info = get_metadata(f"/{to_camel_case(item_type)}/{item}",
                                         ff_env=self.portal_env, vapp=self.portal_vapp)
                     if not isinstance(info, dict) or 'uuid' not in info:
                         found = False

From 352c8016fde503ba5e7cf2afad6b657df0c55bc7 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 13 Nov 2023 14:58:26 -0500
Subject: [PATCH 14/49] lint fixes

---
 dcicutils/bundle_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 82e115e65..ddb8fda68 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -1,4 +1,3 @@
-import json
 import copy
 
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -505,7 +504,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
             self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
                                                                override_schemas=override_schemas, noschemas=noschemas)
             schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value]
-            self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None # , schemas=schemas)
+            self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None
         else:
             self.schema_manager = None
             self.schemas = {}

From 28b780bf75bc2cfa8767c3bc3d75f0efbcc31d72 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Tue, 14 Nov 2023 12:56:20 -0500
Subject: [PATCH 15/49] SMaHT ingestion work.

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 122ce5164..259063fbb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.2.0.1b1"
+version = "8.2.0.1b1"  # TODO: To become 8.3.0
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"

From 8d3d4322a46ca33259ae1f81acd0eda288befe4c Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 15 Nov 2023 10:59:10 -0500
Subject: [PATCH 16/49] Support ordering in sheet_utils.

---
 dcicutils/bundle_utils.py |  3 ++-
 dcicutils/sheet_utils.py  | 39 ++++++++++++++++++++++++---------------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index ddb8fda68..15863a805 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -718,9 +718,10 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
                validate: bool = False,
                retain_empty_properties: bool = False,
                noschemas: bool = False,
+               sheet_order: Optional[List[str]] = None,
                **kwargs):
     annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping,
-                                                    prefer_number=False, **kwargs)
+                                                    prefer_number=False, sheet_order=sheet_order, **kwargs)
     tabbed_rows = annotated_data['content']
     flattened = annotated_data['flattened']
     if flattened:
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index c0d7d0a80..07b15a993 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -17,7 +17,7 @@
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
 from .common import AnyJsonData, Regexp, JsonSchema
 from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize
-from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix
+from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_camel_case
 
 
 Header = str
@@ -199,10 +199,10 @@ def load(cls, filename: str, **kwargs) -> TabbedSheetData:
         raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.")  # noQA
 
     @property
-    def tab_names(self) -> List[str]:
+    def tab_names(self, order: Optional[List[str]] = None) -> List[str]:
         raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..")  # noQA
 
-    def load_content(self) -> Any:
+    def load_content(self, sheet_order: Optional[List[str]] = None) -> Any:
         raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.")  # noQA
 
 
@@ -291,8 +291,8 @@ def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) ->
         """
         raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")  # noQA
 
-    def load_content(self) -> AnyJsonData:
-        for tab_name in self.tab_names:
+    def load_content(self, sheet_order: Optional[List[str]] = None) -> AnyJsonData:
+        for tab_name in self.tab_names(sheet_order):
             sheet_content = []
             state = self._create_tab_processor_state(tab_name)
             for row_data in self._raw_row_generator_for_tab_name(tab_name):
@@ -378,9 +378,20 @@ def _all_cols(cls, sheet: Worksheet):
         for col in range(1, col_max + 1):
             yield col
 
-    @property
-    def tab_names(self) -> List[str]:
-        return self.reader_agent.sheetnames
+    def tab_names(self, order: Optional[List[str]] = None) -> List[str]:
+        def ordered_sheet_names(sheet_names: List[str]) -> List[str]:
+            if not order:
+                return sheet_names
+            ordered_sheet_names = []
+            for item in order:
+                for sheet_name in sheet_names:
+                    if to_camel_case(item.replace(" ", "")) == to_camel_case(sheet_name.replace(" ", "")):
+                        ordered_sheet_names.append(sheet_name)
+            for sheet_name in sheet_names:
+                if sheet_name not in ordered_sheet_names:
+                    ordered_sheet_names.append(sheet_name)
+            return ordered_sheet_names
+        return ordered_sheet_names(self.reader_agent.sheetnames)
 
     def _get_reader_agent(self) -> Workbook:
         return openpyxl.load_workbook(self.filename)
@@ -464,8 +475,7 @@ def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs):
         self._tab_name = tab_name or infer_tab_name_from_filename(filename)
         super().__init__(filename=filename, **kwargs)
 
-    @property
-    def tab_names(self) -> List[str]:
+    def tab_names(self, order: Optional[List[str]] = None) -> List[str]:
         return [self._tab_name]
 
 
@@ -492,8 +502,7 @@ def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData:
         ignored(filename)
         return data
 
-    @property
-    def tab_names(self) -> List[str]:
+    def tab_names(self, order: Optional[List[str]] = None) -> List[str]:
         return list(self.content_by_tab_name.keys())
 
     def _get_reader_agent(self) -> Any:
@@ -512,7 +521,7 @@ def extract_tabbed_headers(cls, data: TabbedSheetData) -> TabbedHeaders:
             result[tab] = headers
         return result
 
-    def load_content(self) -> Dict[str, AnyJsonData]:
+    def load_content(self, sheet_order: Optional[List[str]] = None) -> Dict[str, AnyJsonData]:
         data = self._load_inserts_data(self.filename)
         self.content_by_tab_name = data
         self.headers_by_tab_name = self.extract_tabbed_headers(data)
@@ -743,7 +752,7 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[
 
     @classmethod
     def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
-                       retain_empty_properties: bool = False,
+                       retain_empty_properties: bool = False, sheet_order: Optional[List[str]] = None,
                        **kwargs) -> Dict:
         """
         Given a filename and various options
@@ -752,7 +761,7 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping:
         with maybe_unpack(filename) as filename:
             manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
                                                         **kwargs)
-            content: TabbedSheetData = manager.load_content()
+            content: TabbedSheetData = manager.load_content(sheet_order)
             return {
                 'filename': filename,
                 'content': content,

From 728c5bbd22baaeabdf6a0df319944006b3006c2e Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 15 Nov 2023 16:52:23 -0500
Subject: [PATCH 17/49] Minor sheet_utils updates related to SMaHT ingestion.

---
 dcicutils/bundle_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 15863a805..5240790b7 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -162,6 +162,10 @@ def apply_hint(self, value):
                 if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
                     raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}")
             return value
+        self._apply_ref_hint(value)
+        return value
+
+    def _apply_ref_hint(self, value):
         if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value):
             raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}")
         return value

From bb7ebc909bceb30f5d487e1e4aa5ebcea98a16cc Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 15 Nov 2023 17:06:57 -0500
Subject: [PATCH 18/49] Minor sheet_utils updates related to SMaHT ingestion.

---
 dcicutils/bundle_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 5240790b7..2f2b33987 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -750,7 +750,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
 def remove_empty_properties(data: Optional[Union[list, dict]]) -> None:
     if isinstance(data, dict):
         for key in list(data.keys()):
-            if (value := data[key]) in [None, ""]:
+            if (value := data[key]) in [None, "", {}, []]:
                 del data[key]
             else:
                 remove_empty_properties(value)

From beb9cf7a735781a6440602c3c38bd30584cdd8b6 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 15 Nov 2023 17:35:08 -0500
Subject: [PATCH 19/49] Minor sheet_utils updates related to SMaHT ingestion.

---
 dcicutils/bundle_utils.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 2f2b33987..c9066f2ec 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -158,14 +158,15 @@ def __init__(self, schema_name: str, context: TypeHintContext):
     def apply_hint(self, value):
         if self.is_array:
             value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else []
-            for item in value:
-                if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
-                    raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}")
             return value
         self._apply_ref_hint(value)
         return value
 
     def _apply_ref_hint(self, value):
+        if self.is_array:
+            for item in value:
+                if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
+                    raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}")
         if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value):
             raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}")
         return value
@@ -269,7 +270,7 @@ def parse_sheet_headers(cls, headers: Headers) -> ParsedHeaders:
     def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
         prototype = {}
         for parsed_header in parsed_headers:
-            parsed_header0 = parsed_header[0]
+            parsed_header0 = parsed_header[0] if parsed_header else ""
             if isinstance(parsed_header0, int):
                 raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}")
             cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
@@ -277,7 +278,9 @@ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
 
     @classmethod
     def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
-        [key0, *more_keys] = keys
+        key0 = None
+        more_keys = None
+        [key0, *more_keys] = keys if keys else [None]
         key1 = more_keys[0] if more_keys else None
         if isinstance(key1, int):
             placeholder = []
@@ -354,7 +357,7 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any
         # if (value is None or value == '') and not force:
         if value is None and not force:
             return
-        [key, *more_path] = path
+        [key, *more_path] = path if path else [None]
         if not more_path:
             datum[key] = value
         else:

From 023bc78d4b2cd706f993641f7b689ddd8a61c42f Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Thu, 16 Nov 2023 08:56:40 -0500
Subject: [PATCH 20/49] Minor sheet_utils updates related to SMaHT ingestion.

---
 dcicutils/bundle_utils.py     | 26 +++++++++-----------------
 dcicutils/validation_utils.py | 10 +++++++---
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index c9066f2ec..93b08a2e0 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -485,8 +485,7 @@ class TableChecker(InflatableTabbedDataManager, TypeHintContext):
     def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
                  override_schemas: Optional[TabbedJsonSchemas] = None,
                  portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
-                 apply_heuristics: bool = False,
-                 noschemas: bool = False):
+                 apply_heuristics: bool = False):
 
         self.flattened = flattened
         # if not flattened:
@@ -496,7 +495,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
         #     # -kmp 25-Oct-2023
         #     raise ValueError("Only flattened=True is supported by TableChecker for now.")
 
-        if portal_env is None and portal_vapp is None and not noschemas:
+        if portal_env is None and portal_vapp is None:
             portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
         # InflatableTabbedDataManager supplies:
         #   self.tabbed_sheet_data: TabbedSheetData =
@@ -507,14 +506,10 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
         super().__init__(tabbed_sheet_data=tabbed_sheet_data, apply_heuristics=apply_heuristics)
         self.portal_env = portal_env
         self.portal_vapp = portal_vapp
-        if not noschemas:
-            self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
-                                                               override_schemas=override_schemas, noschemas=noschemas)
-            schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value]
-            self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None
-        else:
-            self.schema_manager = None
-            self.schemas = {}
+        self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
+                                                           override_schemas=override_schemas)
+        schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value]
+        self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch)
         self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = {
             tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows)
             for tab_name, rows in tabbed_sheet_data.items()
@@ -676,11 +671,10 @@ def check(cls, tabbed_sheet_data: TabbedSheetData, *,
               flattened: bool,
               override_schemas: Optional[TabbedJsonSchemas] = None,
               apply_heuristics: bool = False,
-              noschemas: bool = False,
               portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
         checker = cls(tabbed_sheet_data, flattened=flattened,
                       override_schemas=override_schemas, apply_heuristics=apply_heuristics,
-                      portal_env=portal_env, portal_vapp=portal_vapp, noschemas=noschemas)
+                      portal_env=portal_env, portal_vapp=portal_vapp)
         checked = checker.check_tabs()
         return checked
 
@@ -724,7 +718,6 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
                #       but for production use maybe should not be? -kmp 25-Oct-2023
                validate: bool = False,
                retain_empty_properties: bool = False,
-               noschemas: bool = False,
                sheet_order: Optional[List[str]] = None,
                **kwargs):
     annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping,
@@ -735,15 +728,14 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
         checked_items = TableChecker.check(tabbed_rows, flattened=flattened,
                                            override_schemas=override_schemas,
                                            portal_env=portal_env, portal_vapp=portal_vapp,
-                                           apply_heuristics=apply_heuristics,
-                                           noschemas=noschemas)
+                                           apply_heuristics=apply_heuristics)
     else:
         # No fancy checking for things like .json, etc. for now. Only check things that came from
         # spreadsheet-like data, where structural datatypes are forced into strings.
         checked_items = tabbed_rows
     if not retain_empty_properties:
         remove_empty_properties(checked_items)
-    if validate and not noschemas:
+    if validate:
         problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp,
                                                  override_schemas=override_schemas)
         return checked_items, problems
diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py
index e3a0e050c..abe6bcb2e 100644
--- a/dcicutils/validation_utils.py
+++ b/dcicutils/validation_utils.py
@@ -27,10 +27,10 @@ def fresh_schema_manager_context_for_testing(cls):
         # finally:
         #     cls.SCHEMA_CACHE = old_schema_cache
 
-    def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, noschemas: bool = False,
+    def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None,
                  portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
         self.SCHEMA_CACHE = {}  # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
-        if portal_env is None and portal_vapp is None and not noschemas:
+        if portal_env is None and portal_vapp is None:
             portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
             PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
         self.portal_env = portal_env
@@ -64,10 +64,14 @@ def fetch_schema(self, schema_name: str):
         schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name)
         if schema is None and schema_name not in self.SCHEMA_CACHE:  # If None is already stored, don't look it up again
             schema_name = schema_name.replace(" ", "")
-            schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
+            schema = SchemaManager.get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
             self.SCHEMA_CACHE[schema_name] = schema
         return schema
 
+    @staticmethod
+    def get_schema(name: str, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
+        return get_schema(name, portal_env=portal_env, portal_vapp=portal_vapp)
+
     # Should not be needed, given that SCHEMA_CACHE is an instance variable.
     #
     # @classmethod

From 5f432bf42039d195875a8e87d98ed4c501bbb5e3 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Thu, 16 Nov 2023 15:10:54 -0500
Subject: [PATCH 21/49] Minor sheet_utils updates related to SMaHT ingestion;
 handle missing refs.

---
 dcicutils/bundle_utils.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 93b08a2e0..577c43dc2 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -150,9 +150,10 @@ class RefHint(TypeHint):
     def __str__(self):
         return f"<RefHint {self.schema_name} context={self.context}>"
 
-    def __init__(self, schema_name: str, context: TypeHintContext):
+    def __init__(self, schema_name: str, required: bool, context: TypeHintContext):
         self.schema_name = schema_name
         self.context = context
+        self.required = required
         super().__init__()
 
     def apply_hint(self, value):
@@ -163,6 +164,8 @@ def apply_hint(self, value):
         return value
 
     def _apply_ref_hint(self, value):
+        if not value and self.required:
+            raise ValidationProblem(f"Missing required {self.schema_name} reference")
         if self.is_array:
             for item in value:
                 if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
@@ -364,7 +367,8 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any
             cls.set_path_value(datum[key], more_path, value)
 
     @classmethod
-    def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHintContext] = None):
+    def find_type_hint_for_subschema(cls, subschema: Any, required: bool = False,
+                                     context: Optional[TypeHintContext] = None):
         if subschema is not None:
             t = subschema.get('type')
             if t == 'string':
@@ -374,14 +378,14 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint
                     return EnumHint(mapping)
                 link_to = subschema.get('linkTo')
                 if link_to and context.schema_exists(link_to):
-                    return RefHint(schema_name=link_to, context=context)
+                    return RefHint(schema_name=link_to, required=required, context=context)
                 return StringHint()
             elif t in ('integer', 'number'):
                 return NumHint(declared_type=t)
             elif t == 'boolean':
                 return BoolHint()
             elif t == 'array':
-                array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), context)
+                array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), required=required, context=context)
                 if type(array_type_hint) == RefHint:
                     array_type_hint.is_array = True
                     return array_type_hint
@@ -399,7 +403,8 @@ def finder(subheader, subschema):
                     if subschema.get('type') == 'object':
                         subsubschema = subschema.get('properties', {}).get(key1)
                         if not other_headers:
-                            hint = cls.find_type_hint_for_subschema(subsubschema, context=context)
+                            required = key1 and subschema and key1 in subschema.get('required', [])
+                            hint = cls.find_type_hint_for_subschema(subsubschema, required=required, context=context)
                             if hint:
                                 return hint
                             else:
@@ -655,6 +660,9 @@ def check_flattened_row(self, row: Dict, *, tab_name: str, row_number: int, prot
         patch_item = copy.deepcopy(prototype)
         for column_number, column_value in enumerate(row.values()):
             parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics)
+            if len(row) > column_number and (list(row.keys())[column_number] or "").endswith("#"):
+                if isinstance(parsed_value, str):
+                    parsed_value = [value.strip() for value in parsed_value.split(ARRAY_VALUE_DELIMITER) if value]
             type_hint = type_hints[column_number]
             if type_hint:
                 try:

From 8e2141d5f3c05242aabfa2c101562cc55c629134 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Thu, 16 Nov 2023 17:16:18 -0500
Subject: [PATCH 22/49] Minor sheet_utils updates related to SMaHT ingestion;
 array split related.

---
 dcicutils/bundle_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 577c43dc2..d2f3b456e 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -157,9 +157,8 @@ def __init__(self, schema_name: str, required: bool, context: TypeHintContext):
         super().__init__()
 
     def apply_hint(self, value):
-        if self.is_array:
+        if self.is_array and isinstance(value, str):
             value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else []
-            return value
         self._apply_ref_hint(value)
         return value
 

From df5890497a4f5e334111d659eb6e43e4f23bed0d Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Fri, 17 Nov 2023 12:53:17 -0500
Subject: [PATCH 23/49] flake8 fixes

---
 dcicutils/bundle_utils.py | 5 +++--
 dcicutils/qa_utils.py     | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index d2f3b456e..9f66fb4e0 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -384,8 +384,9 @@ def find_type_hint_for_subschema(cls, subschema: Any, required: bool = False,
             elif t == 'boolean':
                 return BoolHint()
             elif t == 'array':
-                array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), required=required, context=context)
-                if type(array_type_hint) == RefHint:
+                array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"),
+                                                                   required=required, context=context)
+                if type(array_type_hint) is RefHint:
                     array_type_hint.is_array = True
                     return array_type_hint
                 return ArrayHint()
diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py
index 0c9b019ed..ac18f3b19 100644
--- a/dcicutils/qa_utils.py
+++ b/dcicutils/qa_utils.py
@@ -3540,7 +3540,7 @@ def recurse(json1, json2, path=""):
                 if not result:
                     # out(f"Recursive failure at {path!r} in list comparison")
                     pass
-        elif type(json1) == type(json2):
+        elif type(json1) is type(json2):
             result = json1 == json2
             if not result:
                 out(f"Failed at {path!r} due to value mismatch: {json.dumps(json1)} != {json.dumps(json2)}")

From 74f504b11c5cd8d0dfa297acee77ef00e80cf109 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Fri, 17 Nov 2023 18:21:26 -0500
Subject: [PATCH 24/49] Minor sheet_utils updates related to SMaHT ingestion;
 handle missing refs.

---
 dcicutils/bundle_utils.py | 46 +++++++++++++++++++++------------------
 test/test_bundle_utils.py |  8 +++++--
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 9f66fb4e0..de42c4280 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -48,7 +48,7 @@ class TypeHint:
     def __init__(self):
         self.is_array = False
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         return value
 
     def __str__(self):
@@ -67,14 +67,14 @@ def __init__(self):
     TRUE_VALUES = ['true', 't']
     FALSE_VALUES = ['false', 'f']
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         if isinstance(value, str) and value:
             l_value = value.lower()
             if l_value in self.TRUE_VALUES:
                 return True
             elif l_value in self.FALSE_VALUES:
                 return False
-        return super().apply_hint(value)
+        return super().apply_hint(value, src)
 
 
 class NumHint(TypeHint):
@@ -87,13 +87,13 @@ def __init__(self, declared_type: Optional[str] = None):
         self.preferred_type = self.PREFERENCE_MAP.get(declared_type)
         super().__init__()
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         if isinstance(value, str) and value:
             if self.preferred_type:
                 return prefer_number(value, kind=self.preferred_type)
             else:
                 return value
-        return super().apply_hint(value)
+        return super().apply_hint(value, src)
 
 
 class EnumHint(TypeHint):
@@ -105,7 +105,7 @@ def __init__(self, value_map):
         self.value_map = value_map
         super().__init__()
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         if isinstance(value, str):
             if value in self.value_map:
                 result = self.value_map[value]
@@ -120,28 +120,28 @@ def apply_hint(self, value):
                     [only_found] = found
                     result = self.value_map[only_found]
                     return result
-        return super().apply_hint(value)
+        return super().apply_hint(value, src)
 
 
 class ArrayHint(TypeHint):
     def __init__(self):
         super().__init__()
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         if value is None or value == '':
             return []
         if isinstance(value, str):
             if not value:
                 return []
             return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)]
-        return super().apply_hint(value)
+        return super().apply_hint(value, src)
 
 
 class StringHint(TypeHint):
     def __init__(self):
         super().__init__()
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         return str(value).strip() if value is not None else ""
 
 
@@ -156,21 +156,23 @@ def __init__(self, schema_name: str, required: bool, context: TypeHintContext):
         self.required = required
         super().__init__()
 
-    def apply_hint(self, value):
+    def apply_hint(self, value, src):
         if self.is_array and isinstance(value, str):
             value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else []
-        self._apply_ref_hint(value)
+        if self.is_array and isinstance(value, list):
+            for item in value:
+                self._apply_ref_hint(item, src)
+        else:
+            self._apply_ref_hint(value, src)
         return value
 
-    def _apply_ref_hint(self, value):
+    def _apply_ref_hint(self, value, src):
         if not value and self.required:
-            raise ValidationProblem(f"Missing required {self.schema_name} reference")
-        if self.is_array:
-            for item in value:
-                if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item):
-                    raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}")
+            raise ValidationProblem(f"No required reference (linkTo) value for: {self.schema_name}"
+                                    f"{f' from {src}' if src else ''}")
         if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value):
-            raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}")
+            raise ValidationProblem(f"Cannot resolve reference (linkTo) for: {self.schema_name}"
+                                    f"{f'/{value}' if value else ''}{f' from {src}' if src else ''}")
         return value
 
 
@@ -660,13 +662,15 @@ def check_flattened_row(self, row: Dict, *, tab_name: str, row_number: int, prot
         patch_item = copy.deepcopy(prototype)
         for column_number, column_value in enumerate(row.values()):
             parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics)
-            if len(row) > column_number and (list(row.keys())[column_number] or "").endswith("#"):
+            column_name = (list(row.keys())[column_number] or "") if len(row) > column_number else ""
+            if column_name.endswith("#"):
                 if isinstance(parsed_value, str):
                     parsed_value = [value.strip() for value in parsed_value.split(ARRAY_VALUE_DELIMITER) if value]
             type_hint = type_hints[column_number]
             if type_hint:
                 try:
-                    parsed_value = type_hint.apply_hint(parsed_value)
+                    src = f"{tab_name}{f'.{column_name}' if column_name else ''}"
+                    parsed_value = type_hint.apply_hint(parsed_value, src)
                 except ValidationProblem as e:
                     headers = self.headers_by_tab_name[tab_name]
                     column_name = headers[column_number]
diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py
index 3edd4acc0..64996e0ae 100644
--- a/test/test_bundle_utils.py
+++ b/test/test_bundle_utils.py
@@ -723,8 +723,12 @@ def test_table_checker():
                                        portal_env=mock_ff_env)
                 checker.check_tabs()
             expected_problems = [
-                f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}",
-                f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}"
+                # f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}",
+                # f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}"
+                # "User[0].project: Cannot resolve reference (linkTo) for: Project/dac6d5b3-6ef6-4271-9715-a78329acf846 from User.project",
+                # "User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/87199845-51b5-4352-bdea-583edae4bb6a from User.user_institution"
+                f"User[0].project: Cannot resolve reference (linkTo) for: Project/{SAMPLE_PROJECT_UUID} from User.project",
+                f"User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution"
             ]
             expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems]
             assert exc.value.problems == expected_problems

From b5ecf1ab86c035c4058d5f0b182aa6a62180ffac Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 18 Nov 2023 09:29:51 -0500
Subject: [PATCH 25/49] flake8 fixes

---
 test/test_bundle_utils.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py
index 64996e0ae..008442430 100644
--- a/test/test_bundle_utils.py
+++ b/test/test_bundle_utils.py
@@ -723,12 +723,10 @@ def test_table_checker():
                                        portal_env=mock_ff_env)
                 checker.check_tabs()
             expected_problems = [
-                # f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}",
-                # f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}"
-                # "User[0].project: Cannot resolve reference (linkTo) for: Project/dac6d5b3-6ef6-4271-9715-a78329acf846 from User.project",
-                # "User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/87199845-51b5-4352-bdea-583edae4bb6a from User.user_institution"
-                f"User[0].project: Cannot resolve reference (linkTo) for: Project/{SAMPLE_PROJECT_UUID} from User.project",
-                f"User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution"
+                f"User[0].project: Cannot resolve reference (linkTo) for:"
+                f" Project/{SAMPLE_PROJECT_UUID} from User.project",
+                f"User[0].user_institution: Cannot resolve reference (linkTo) for:"
+                f" Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution"
             ]
             expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems]
             assert exc.value.problems == expected_problems

From 72057e11001bcc6aaf6ff8741691b292b853b324 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 18 Nov 2023 22:59:55 -0500
Subject: [PATCH 26/49] Added split_string to dcicutils.

---
 dcicutils/misc_utils.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 97a17c21b..99aa0b4c6 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -23,7 +23,7 @@
 from collections import defaultdict
 from datetime import datetime as datetime_type
 from dateutil.parser import parse as dateutil_parse
-from typing import Optional
+from typing import List, Optional
 
 
 # Is this the right place for this? I feel like this should be done in an application, not a library.
@@ -1404,6 +1404,30 @@ def string_list(s):
     return [p for p in [part.strip() for part in s.split(",")] if p]
 
 
+def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> List[str]:
+    """
+    Splits the given string into an array of string based on the given delimiter, and an optional escape character.
+    """
+    if not isinstance(value, str) or not (value := value.strip()):
+        return []
+    if not isinstance(escape, str) or not escape:
+        return [item.strip() for item in value.split(delimiter)]
+    result = []
+    item = r""
+    escaped = False
+    for c in value:
+        if c == delimiter and not escaped:
+            result.append(item.strip())
+            item = r""
+        elif c == escape and not escaped:
+            escaped = True
+        else:
+            item += c
+            escaped = False
+    result.append(item.strip())
+    return [item for item in result if item]
+
+
 def is_c4_arn(arn: str) -> bool:
     """
     Returns True iff the given (presumed) AWS ARN string value looks like it

From 4b6f527d363ab82bf4cec22107588db6ddda2a08 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 18 Nov 2023 23:07:43 -0500
Subject: [PATCH 27/49] Added split_string to dcicutils.

---
 test/test_misc_utils.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py
index b32a6d4ce..f2c9ebeed 100644
--- a/test/test_misc_utils.py
+++ b/test/test_misc_utils.py
@@ -31,7 +31,7 @@
     ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn,
     deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime,
     MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents,
-    pad_to, JsonLinesReader,
+    pad_to, JsonLinesReader, split_string
 )
 from dcicutils.qa_utils import (
     Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output,
@@ -3590,3 +3590,22 @@ def test_json_lines_reader_lists():
             parsed = [line for line in JsonLinesReader(fp)]
             expected = [item1, item2]
             assert parsed == expected
+
+
+def test_split_array_string():
+    def split_array_string(value: str) -> List[str]:
+        return split_string(value, "|", "\\")
+    assert split_array_string(r"abc|def|ghi") == ["abc", "def", "ghi"]
+    assert split_array_string(r"abc\|def|ghi") == ["abc|def", "ghi"]
+    assert split_array_string(r"abc\\|def|ghi") == ["abc\\", "def", "ghi"]
+    assert split_array_string(r"abc\\\|def|ghi") == ["abc\\|def", "ghi"]
+    assert split_array_string(r"abc\\\|def\|ghi") == ["abc\\|def|ghi"]
+    assert split_array_string(r"abc\\|\\def\|ghi") == ["abc\\", "\\def|ghi"]
+    assert split_array_string(r"abc\\|\\def\|ghi||") == ["abc\\", "\\def|ghi"]
+    assert split_array_string(r"|abcdefghi|") == ["abcdefghi"]
+    assert split_array_string(r"|abcdefghi||xyzzy") == ["abcdefghi", "xyzzy"]
+    assert split_array_string(r"") == []
+    assert split_array_string(None) == []
+    assert split_array_string(r"|") == []
+    assert split_array_string(r"\|") == ["|"]
+    assert split_array_string(r"\\|") == ["\\"]

From 17d1d4d6014ba0503131857f582653bfad4082ef Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 18 Nov 2023 23:14:18 -0500
Subject: [PATCH 28/49] Added merge_objects to misc_utils.

---
 dcicutils/misc_utils.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 99aa0b4c6..4582f9c0a 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -23,7 +23,7 @@
 from collections import defaultdict
 from datetime import datetime as datetime_type
 from dateutil.parser import parse as dateutil_parse
-from typing import List, Optional
+from typing import Any, List, Optional, Union
 
 
 # Is this the right place for this? I feel like this should be done in an application, not a library.
@@ -2074,6 +2074,21 @@ def merge_key_value_dict_lists(x, y):
     return [key_value_dict(k, v) for k, v in merged.items()]
 
 
+def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]]) -> dict:
+    if isinstance(target, dict) and isinstance(source, dict) and source:
+        for key, value in source.items():
+            target[key] = merge_objects(target[key], value) if key in target else value
+    elif isinstance(target, list) and isinstance(source, list) and source:
+        for i in range(max(len(source), len(target))):
+            if i < len(target):
+                target[i] = merge_objects(target[i], source[i] if i < len(source) else source[len(source) - 1])
+            else:
+                target.append(source[i])
+    elif source:
+        target = source
+    return target
+
+
 # Stealing topological sort classes below from python's graphlib module introduced
 # in v3.9 with minor refactoring.
 # Source: https://github.com/python/cpython/blob/3.11/Lib/graphlib.py

From 5aaaaed69f98c899ba501aa8c82538c7a907f6b9 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 18 Nov 2023 23:23:37 -0500
Subject: [PATCH 29/49] Added remove_empty_properties to misc_utils.

---
 dcicutils/bundle_utils.py | 14 +-------------
 dcicutils/misc_utils.py   | 12 ++++++++++++
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index de42c4280..0f9b73e1e 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -4,7 +4,7 @@
 from .common import AnyJsonData
 from .env_utils import EnvUtils, public_env_name
 from .ff_utils import get_metadata
-from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, to_camel_case
+from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, remove_empty_properties, to_camel_case
 from .sheet_utils import (
     LoadTableError, prefer_number, TabbedJsonSchemas,
     Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData,
@@ -752,15 +752,3 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
                                                  override_schemas=override_schemas)
         return checked_items, problems
     return checked_items
-
-
-def remove_empty_properties(data: Optional[Union[list, dict]]) -> None:
-    if isinstance(data, dict):
-        for key in list(data.keys()):
-            if (value := data[key]) in [None, "", {}, []]:
-                del data[key]
-            else:
-                remove_empty_properties(value)
-    elif isinstance(data, list):
-        for item in data:
-            remove_empty_properties(item)
diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 4582f9c0a..40b4d1604 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -1107,6 +1107,18 @@ def remove_suffix(suffix: str, text: str, required: bool = False):
     return text[:len(text)-len(suffix)]
 
 
+def remove_empty_properties(data: Optional[Union[list, dict]]) -> None:
+    if isinstance(data, dict):
+        for key in list(data.keys()):
+            if (value := data[key]) in [None, "", {}, []]:
+                del data[key]
+            else:
+                remove_empty_properties(value)
+    elif isinstance(data, list):
+        for item in data:
+            remove_empty_properties(item)
+
+
 class ObsoleteError(Exception):
     pass
 

From 7fd4037e750d712e57778aff802b27a08f8e07c7 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sat, 18 Nov 2023 23:38:42 -0500
Subject: [PATCH 30/49] Added zip_utils.

---
 dcicutils/zip_utils.py | 78 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 dcicutils/zip_utils.py

diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py
new file mode 100644
index 000000000..9324d1e2c
--- /dev/null
+++ b/dcicutils/zip_utils.py
@@ -0,0 +1,78 @@
+from contextlib import contextmanager
+import gzip
+import os
+import shutil
+import tarfile
+import tempfile
+from typing import List, Optional, Union
+import zipfile
+
+
+@contextmanager
+def unpack_zip_file_to_temporary_directory(file: str) -> str:
+    with temporary_directory() as tmp_directory_name:
+        with zipfile.ZipFile(file, "r") as zipf:
+            zipf.extractall(tmp_directory_name)
+        yield tmp_directory_name
+
+
+@contextmanager
+def unpack_tar_file_to_temporary_directory(file: str) -> str:
+    with temporary_directory() as tmp_directory_name:
+        with tarfile.open(file, "r") as tarf:
+            tarf.extractall(tmp_directory_name)
+        yield tmp_directory_name
+
+
+def unpack_files(file: str, suffixes: Optional[List[str]] = None) -> Optional[str]:
+    unpack_file_to_tmp_directory = {
+        ".tar": unpack_tar_file_to_temporary_directory,
+        ".zip": unpack_zip_file_to_temporary_directory
+    }.get(file[dot:]) if (dot := file.rfind(".")) > 0 else None
+    if unpack_file_to_tmp_directory is not None:
+        with unpack_file_to_tmp_directory(file) as tmp_directory_name:
+            for directory, _, files in os.walk(tmp_directory_name):  # Ignore "." prefixed files.
+                for file in [file for file in files if not file.startswith(".")]:
+                    if not suffixes or any(file.endswith(suffix) for suffix in suffixes):
+                        yield os.path.join(directory, file)
+
+
+@contextmanager
+def unpack_gz_file_to_temporary_file(file: str, suffix: Optional[str] = None) -> str:
+    if (gz := file.endswith(".gz")) or file.endswith(".tgz"):  # The .tgz suffix is simply short for .tar.gz.
+        with temporary_file(name=os.path.basename(file[:-3] if gz else file[:-4] + ".tar")) as tmp_file_name:
+            with open(tmp_file_name, "wb") as outputf:
+                with gzip.open(file, "rb") as inputf:
+                    outputf.write(inputf.read())
+                    outputf.close()
+                    yield tmp_file_name
+
+
+@contextmanager
+def temporary_directory() -> str:
+    try:
+        with tempfile.TemporaryDirectory() as tmp_directory_name:
+            yield tmp_directory_name
+    finally:
+        remove_temporary_directory(tmp_directory_name)
+
+
+@contextmanager
+def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None,
+                   content: Optional[Union[str, List[str]]] = None) -> str:
+    with temporary_directory() as tmp_directory_name:
+        tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "")
+        with open(tmp_file_name, "w") as tmp_file:
+            tmp_file.write("\n".join(content) if isinstance(content, list) else str(content))
+        yield tmp_file_name
+
+
+def remove_temporary_directory(tmp_directory_name: str) -> None:
+    def is_temporary_directory(path: str) -> bool:
+        try:
+            tmpdir = tempfile.gettempdir()
+            return os.path.commonpath([path, tmpdir]) == tmpdir and os.path.exists(path) and os.path.isdir(path)
+        except Exception:
+            return False
+    if is_temporary_directory(tmp_directory_name):  # Guard against errant deletion.
+        shutil.rmtree(tmp_directory_name)

From 9c5712c60c9b4c4a9ab8cbd680de5a394fc8159e Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sun, 19 Nov 2023 10:56:47 -0500
Subject: [PATCH 31/49] Minor fix to temporary_file in zip_utils.

---
 dcicutils/zip_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py
index 9324d1e2c..56ad9bbbf 100644
--- a/dcicutils/zip_utils.py
+++ b/dcicutils/zip_utils.py
@@ -62,8 +62,9 @@ def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None,
                    content: Optional[Union[str, List[str]]] = None) -> str:
     with temporary_directory() as tmp_directory_name:
         tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "")
-        with open(tmp_file_name, "w") as tmp_file:
-            tmp_file.write("\n".join(content) if isinstance(content, list) else str(content))
+        if content is not None:
+            with open(tmp_file_name, "wb" if isinstance(content, bytes) else "w") as tmp_file:
+                tmp_file.write("\n".join(content) if isinstance(content, list) else content)
         yield tmp_file_name
 
 

From ff2a6073aaf9969f40fe679cf5dd52349385d531 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sun, 19 Nov 2023 11:14:57 -0500
Subject: [PATCH 32/49] Added right_trim_tuple to misc_utils.

---
 dcicutils/misc_utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 40b4d1604..f4be0b885 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -23,7 +23,7 @@
 from collections import defaultdict
 from datetime import datetime as datetime_type
 from dateutil.parser import parse as dateutil_parse
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Tuple, Union
 
 
 # Is this the right place for this? I feel like this should be done in an application, not a library.
@@ -1440,6 +1440,13 @@ def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> Li
     return [item for item in result if item]
 
 
+def right_trim_tuple(t: Tuple[Any]) -> Tuple[Any]:
+    i = len(t) - 1
+    while i >= 0 and t[i] is None:
+        i -= 1
+    return t[:i + 1]
+
+
 def is_c4_arn(arn: str) -> bool:
     """
     Returns True iff the given (presumed) AWS ARN string value looks like it

From bc27b136ac5333abd81721e5ca00ca2d70c3316e Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Sun, 19 Nov 2023 12:53:10 -0500
Subject: [PATCH 33/49] Added right_trim_tuple to misc_utils.

---
 dcicutils/zip_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py
index 56ad9bbbf..e98e6ac70 100644
--- a/dcicutils/zip_utils.py
+++ b/dcicutils/zip_utils.py
@@ -59,7 +59,7 @@ def temporary_directory() -> str:
 
 @contextmanager
 def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None,
-                   content: Optional[Union[str, List[str]]] = None) -> str:
+                   content: Optional[Union[str, bytes, List[str]]] = None) -> str:
     with temporary_directory() as tmp_directory_name:
         tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "")
         if content is not None:

From 9b0df7af9e6a7a5cb67e199320be57895b489f55 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 20 Nov 2023 07:42:42 -0500
Subject: [PATCH 34/49] Added right_trim_tuple to misc_utils.

---
 dcicutils/misc_utils.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index f4be0b885..1893b3d1d 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -1440,11 +1440,15 @@ def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> Li
     return [item for item in result if item]
 
 
-def right_trim_tuple(t: Tuple[Any]) -> Tuple[Any]:
-    i = len(t) - 1
-    while i >= 0 and t[i] is None:
+def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]]) -> Union[List[Any], Tuple[Any]]:
+    """
+    Removes training None (or emptry string) values from the give tuple or list arnd returns;
+    does NOT change the given value.
+    """
+    i = len(list_or_tuple) - 1
+    while i >= 0 and list_or_tuple[i] in (None, ""):
         i -= 1
-    return t[:i + 1]
+    return list_or_tuple[:i + 1]
 
 
 def is_c4_arn(arn: str) -> bool:
@@ -2093,14 +2097,21 @@ def merge_key_value_dict_lists(x, y):
     return [key_value_dict(k, v) for k, v in merged.items()]
 
 
-def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]]) -> dict:
+def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict:
+    """
+    Merges the given source dictionary or list into the target dictionary or list.
+    This DOES change the given target (dictionary or list) IN PLACE.
+    """
     if isinstance(target, dict) and isinstance(source, dict) and source:
         for key, value in source.items():
-            target[key] = merge_objects(target[key], value) if key in target else value
+            target[key] = merge_objects(target[key], value, full) if key in target else value
     elif isinstance(target, list) and isinstance(source, list) and source:
         for i in range(max(len(source), len(target))):
             if i < len(target):
-                target[i] = merge_objects(target[i], source[i] if i < len(source) else source[len(source) - 1])
+                if i < len(source):
+                    target[i] = merge_objects(target[i], source[i], full)
+                elif full:
+                    target[i] = merge_objects(target[i], source[len(source) - 1], full)
             else:
                 target.append(source[i])
     elif source:

From 4d407981605d810ba7fa6e19168c25f5a0fb5dc4 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 20 Nov 2023 11:38:21 -0500
Subject: [PATCH 35/49] Added to_float and to_integer to misc_utils.

---
 dcicutils/misc_utils.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 1893b3d1d..1c8effead 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -978,6 +978,20 @@ def str_to_bool(x: Optional[str]) -> Optional[bool]:
         raise ValueError(f"An argument to str_or_bool must be a string or None: {x!r}")
 
 
+def to_integer(value: str, fallback: Optional[Any] = None) -> Optional[Any]:
+    try:
+        return int(value)
+    except Exception:
+        return fallback
+
+
+def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]:
+    try:
+        return float(value)
+    except Exception:
+        return fallback
+
+
 @contextlib.contextmanager
 def override_environ(**overrides):
     """

From 471f05b18eecc34e002b5e3d8fd935b534598005 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 20 Nov 2023 12:21:33 -0500
Subject: [PATCH 36/49] Added to_boolean to misc_utils

---
 dcicutils/misc_utils.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 1c8effead..8076ea40a 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -992,6 +992,15 @@ def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]:
         return fallback
 
 
+def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]:
+    if isinstance(value, str) and (value := value.strip().lower()):
+        if (lower_value := value.lower()) in ["true", "t"]:
+            return True
+        elif lower_value in ["false", "f"]:
+            return False
+    return fallback
+
+
 @contextlib.contextmanager
 def override_environ(**overrides):
     """

From a7e50d1dade36d0c69d9faf30c50e0a8ab70808e Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 20 Nov 2023 12:35:54 -0500
Subject: [PATCH 37/49] Added to_enum to misc_utils

---
 dcicutils/misc_utils.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 8076ea40a..ba10a9ea0 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -1001,6 +1001,20 @@ def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]:
     return fallback
 
 
+def to_enum(value: str, enumerators: List[str]) -> Optional[str]:
+    matches = []
+    if isinstance(value, str) and (value := value.strip()) and isinstance(enumerators, List):
+        enum_specifiers = {str(enum).lower(): enum for enum in enumerators}
+        if (enum_value := enum_specifiers.get(lower_value := value.lower())) is not None:
+            return enum_value
+        for enum_canonical, _ in enum_specifiers.items():
+            if enum_canonical.startswith(lower_value):
+                matches.append(enum_canonical)
+        if len(matches) == 1:
+            return enum_specifiers[matches[0]]
+    return enum_specifiers[matches[0]] if len(matches) == 1 else value
+
+
 @contextlib.contextmanager
 def override_environ(**overrides):
     """

From 8c945ba9bd75654f01dcc19c72dcae4eacc8a964 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 20 Nov 2023 14:14:52 -0500
Subject: [PATCH 38/49] Updated rst doc file with zip_utils.

---
 docs/source/dcicutils.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst
index 2d5b8058c..3f10ef95f 100644
--- a/docs/source/dcicutils.rst
+++ b/docs/source/dcicutils.rst
@@ -335,3 +335,10 @@ variant_utils
 
 .. automodule:: dcicutils.variant_utils
   :members:
+
+
+zip_utils
+^^^^^^^^^
+
+.. automodule:: dcicutils.zip_utils
+   :members:

From c303955f19e64b052f761b0fe9827465e21b419e Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 22 Nov 2023 10:57:24 -0500
Subject: [PATCH 39/49] Added data_readers.

---
 dcicutils/data_readers.py | 138 ++++++++++++++++++++++++++++++++++++++
 dcicutils/misc_utils.py   |   9 +++
 2 files changed, 147 insertions(+)
 create mode 100644 dcicutils/data_readers.py

diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py
new file mode 100644
index 000000000..5b525d253
--- /dev/null
+++ b/dcicutils/data_readers.py
@@ -0,0 +1,138 @@
+import abc
+import csv
+import openpyxl
+from typing import Any, Generator, Iterator, List, Optional, Tuple, Union
+from dcicutils.misc_utils import right_trim
+
+
+class RowReader(abc.ABC):  # These readers may evenutally go into dcicutils.
+
+    def __init__(self):
+        self.header = None
+        self.location = 0
+        self._warning_empty_headers = False
+        self._warning_extra_values = []  # Line numbers.
+        self.open()
+
+    def __iter__(self) -> Iterator:
+        for row in self.rows:
+            self.location += 1
+            if self.is_terminating_row(row):
+                break
+            if len(self.header) < len(row):  # Row values beyond what there are headers for are ignored.
+                self._warning_extra_values.append(self.location)
+            yield {column: self.cell_value(value) for column, value in zip(self.header, row)}
+
+    def _define_header(self, header: List[Optional[Any]]) -> None:
+        self.header = []
+        for index, column in enumerate(header or []):
+            if not (column := str(column).strip() if column is not None else ""):
+                self._warning_empty_headers = True
+                break  # Empty header column signals end of header.
+            self.header.append(column)
+
+    def rows(self) -> Generator[Union[List[Optional[Any]], Tuple[Optional[Any], ...]], None, None]:
+        yield
+
+    def is_terminating_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool:
+        return False
+
+    def cell_value(self, value: Optional[Any]) -> Optional[Any]:
+        return str(value).strip() if value is not None else ""
+
+    def open(self) -> None:
+        pass
+
+    @property
+    def issues(self) -> Optional[List[str]]:
+        issues = []
+        if self._warning_empty_headers:
+            issues.append("Empty header column encountered; ignoring it and all subsequent columns.")
+        if self._warning_extra_values:
+            issues.extend([f"Extra column values on row [{row_number}]" for row_number in self._warning_extra_values])
+        return issues if issues else None
+
+
+class ListReader(RowReader):
+
+    def __init__(self, rows: List[List[Optional[Any]]]) -> None:
+        self._rows = rows
+        super().__init__()
+
+    @property
+    def rows(self) -> Generator[List[Optional[Any]], None, None]:
+        for row in self._rows[1:]:
+            yield row
+
+    def open(self) -> None:
+        if not self.header:
+            self._define_header(self._rows[0] if self._rows else [])
+
+
+class CsvReader(RowReader):
+
+    def __init__(self, file: str) -> None:
+        self._file = file
+        self._file_handle = None
+        self._rows = None
+        super().__init__()
+
+    @property
+    def rows(self) -> Generator[List[Optional[Any]], None, None]:
+        for row in self._rows:
+            yield right_trim(row)
+
+    def open(self) -> None:
+        if self._file_handle is None:
+            self._file_handle = open(self._file)
+            self._rows = csv.reader(self._file_handle, delimiter="\t" if self._file.endswith(".tsv") else ",")
+            self._define_header(right_trim(next(self._rows, [])))
+
+    def __del__(self) -> None:
+        if (file_handle := self._file_handle) is not None:
+            self._file_handle = None
+            file_handle.close()
+
+
+class ExcelSheetReader(RowReader):
+
+    def __init__(self, sheet_name: str, workbook: openpyxl.workbook.workbook.Workbook) -> None:
+        self.sheet_name = sheet_name or "Sheet1"
+        self._workbook = workbook
+        self._rows = None
+        super().__init__()
+
+    @property
+    def rows(self) -> Generator[Tuple[Optional[Any], ...], None, None]:
+        for row in self._rows(min_row=2, values_only=True):
+            yield right_trim(row)
+
+    def is_terminating_row(self, row: Tuple[Optional[Any]]) -> bool:
+        return all(cell is None for cell in row)  # Empty row signals end of data.
+
+    def open(self) -> None:
+        if not self._rows:
+            self._rows = self._workbook[self.sheet_name].iter_rows
+            self._define_header(right_trim(next(self._rows(min_row=1, max_row=1, values_only=True), [])))
+
+
+class Excel:
+
+    def __init__(self, file: str) -> None:
+        self._file = file
+        self._workbook = None
+        self.sheet_names = None
+        self.open()
+
+    def sheet_reader(self, sheet_name: str) -> ExcelSheetReader:
+        return ExcelSheetReader(sheet_name=sheet_name, workbook=self._workbook)
+
+    def open(self) -> None:
+        if self._workbook is None:
+            self._workbook = openpyxl.load_workbook(self._file, data_only=True)
+            self.sheet_names = self._workbook.sheetnames or []
+
+    def __del__(self) -> None:
+        if (workbook := self._workbook) is not None:
+            self._workbook = None
+            workbook.close()
diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index ba10a9ea0..83a3094e9 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -1369,6 +1369,15 @@ def json_file_contents(filename):
         return json.load(fp)
 
 
+def load_json_if(s: str, is_array: bool = False, is_object: bool = False) -> Optional[Any]:
+    if ((is_object and s.startswith("{") and s.endswith("}")) or (is_array and s.startswith("[") and s.endswith("]"))):
+        try:
+            return json.loads(s)
+        except Exception:
+            pass
+    return s
+
+
 def camel_case_to_snake_case(s, separator='_'):
     """
     Converts CamelCase to snake_case.

From eb9ccf36071e227e7d3e1fd6ff933efb8611a444 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 22 Nov 2023 11:17:10 -0500
Subject: [PATCH 40/49] Comments

---
 dcicutils/data_readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py
index 5b525d253..48a0d0c01 100644
--- a/dcicutils/data_readers.py
+++ b/dcicutils/data_readers.py
@@ -5,7 +5,7 @@
 from dcicutils.misc_utils import right_trim
 
 
-class RowReader(abc.ABC):  # These readers may evenutally go into dcicutils.
+class RowReader(abc.ABC):
 
     def __init__(self):
         self.header = None

From 7037476ea34d96b973a0286faa56db5d50b0b198 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Wed, 22 Nov 2023 13:45:24 -0500
Subject: [PATCH 41/49] Minor update to merge_objects.

---
 dcicutils/misc_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 83a3094e9..0b93f79c5 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -2146,8 +2146,10 @@ def merge_key_value_dict_lists(x, y):
 def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict:
     """
     Merges the given source dictionary or list into the target dictionary or list.
-    This DOES change the given target (dictionary or list) IN PLACE.
+    This MAY change the given target (dictionary or list) IN PLACE.
     """
+    if target is None:
+        return source
     if isinstance(target, dict) and isinstance(source, dict) and source:
         for key, value in source.items():
             target[key] = merge_objects(target[key], value, full) if key in target else value

From 19612b15fb02e8e332a90139777433e3ed032560 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 27 Nov 2023 11:42:56 -0500
Subject: [PATCH 42/49] Minor updates to Excel class in data_readers.

---
 dcicutils/data_readers.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py
index 48a0d0c01..46a07a059 100644
--- a/dcicutils/data_readers.py
+++ b/dcicutils/data_readers.py
@@ -1,7 +1,7 @@
 import abc
 import csv
 import openpyxl
-from typing import Any, Generator, Iterator, List, Optional, Tuple, Union
+from typing import Any, Generator, Iterator, List, Optional, Type, Tuple, Union
 from dcicutils.misc_utils import right_trim
 
 
@@ -17,6 +17,8 @@ def __init__(self):
     def __iter__(self) -> Iterator:
         for row in self.rows:
             self.location += 1
+            if self.is_comment_row(row):
+                continue
             if self.is_terminating_row(row):
                 break
             if len(self.header) < len(row):  # Row values beyond what there are headers for are ignored.
@@ -34,6 +36,9 @@ def _define_header(self, header: List[Optional[Any]]) -> None:
     def rows(self) -> Generator[Union[List[Optional[Any]], Tuple[Optional[Any], ...]], None, None]:
         yield
 
+    def is_comment_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool:
+        return False
+
     def is_terminating_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool:
         return False
 
@@ -118,14 +123,18 @@ def open(self) -> None:
 
 class Excel:
 
-    def __init__(self, file: str) -> None:
+    def __init__(self, file: str, reader_class: Optional[Type] = None) -> None:
         self._file = file
         self._workbook = None
         self.sheet_names = None
+        if isinstance(reader_class, Type) and issubclass(reader_class, ExcelSheetReader):
+            self._reader_class = reader_class
+        else:
+            self._reader_class = ExcelSheetReader
         self.open()
 
     def sheet_reader(self, sheet_name: str) -> ExcelSheetReader:
-        return ExcelSheetReader(sheet_name=sheet_name, workbook=self._workbook)
+        return self._reader_class(sheet_name=sheet_name, workbook=self._workbook)
 
     def open(self) -> None:
         if self._workbook is None:

From cd0f51290a8e096a4d32b1b87171a84aca49aefd Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 27 Nov 2023 11:49:05 -0500
Subject: [PATCH 43/49] Updated dcicutils.rst

---
 docs/source/dcicutils.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst
index 3f10ef95f..f8b902536 100644
--- a/docs/source/dcicutils.rst
+++ b/docs/source/dcicutils.rst
@@ -79,6 +79,13 @@ cloudformation_utils
    :members:
 
 
+data_readers
+^^^^^^^^^^
+
+.. automodule:: dcicutils.data_readers
+   :members:
+
+
 data_utils
 ^^^^^^^^^^
 

From 843b4e0a60cea54e9d02d1f07813f1a7972ae959 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 27 Nov 2023 13:23:27 -0500
Subject: [PATCH 44/49] Minor update to misc_utils.load_json_if

---
 dcicutils/misc_utils.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 0b93f79c5..6f8100d51 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -1369,13 +1369,16 @@ def json_file_contents(filename):
         return json.load(fp)
 
 
-def load_json_if(s: str, is_array: bool = False, is_object: bool = False) -> Optional[Any]:
-    if ((is_object and s.startswith("{") and s.endswith("}")) or (is_array and s.startswith("[") and s.endswith("]"))):
+def load_json_if(s: str, is_array: bool = False, is_object: bool = False,
+                 fallback: Optional[Any] = None) -> Optional[Any]:
+    if (isinstance(s, str) and
+        ((is_object and s.startswith("{") and s.endswith("}")) or
+         (is_array and s.startswith("[") and s.endswith("]")))):
         try:
             return json.loads(s)
         except Exception:
             pass
-    return s
+    return fallback
 
 
 def camel_case_to_snake_case(s, separator='_'):

From 5db09855c5bb8424846918fcb37e6a5579fc1612 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 27 Nov 2023 15:40:18 -0500
Subject: [PATCH 45/49] Added test for merge_objects

---
 test/test_misc_utils.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py
index f2c9ebeed..52bed8f70 100644
--- a/test/test_misc_utils.py
+++ b/test/test_misc_utils.py
@@ -31,7 +31,7 @@
     ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn,
     deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime,
     MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents,
-    pad_to, JsonLinesReader, split_string
+    pad_to, JsonLinesReader, split_string, merge_objects
 )
 from dcicutils.qa_utils import (
     Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output,
@@ -3609,3 +3609,19 @@ def split_array_string(value: str) -> List[str]:
     assert split_array_string(r"|") == []
     assert split_array_string(r"\|") == ["|"]
     assert split_array_string(r"\\|") == ["\\"]
+
+
+def test_merge_objects_1():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, None]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, None]}
+    merge_objects(target, source, False)
+    assert target == expected
+
+
+def test_merge_objects_2():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, None]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"goo": None}]}
+    merge_objects(target, source, True)
+    assert target == expected

From 59673ca8d7923fee519dca1d912a8579afb448df Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 27 Nov 2023 15:54:27 -0500
Subject: [PATCH 46/49] Comments and tests for misc_utils.merge_objects.

---
 dcicutils/misc_utils.py |  4 +++-
 test/test_misc_utils.py | 47 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index 6f8100d51..e38f980df 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -2149,7 +2149,9 @@ def merge_key_value_dict_lists(x, y):
 def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict:
     """
     Merges the given source dictionary or list into the target dictionary or list.
-    This MAY change the given target (dictionary or list) IN PLACE.
+    This MAY well change the given target (dictionary or list) IN PLACE.
+    The the full argument is True then any target lists longer than the
+    source be will be filled out with the last element(s) of the source.
     """
     if target is None:
         return source
diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py
index 52bed8f70..7d737793a 100644
--- a/test/test_misc_utils.py
+++ b/test/test_misc_utils.py
@@ -3625,3 +3625,50 @@ def test_merge_objects_2():
     expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"goo": None}]}
     merge_objects(target, source, True)
     assert target == expected
+
+
+def test_merge_objects_3():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]}
+    merge_objects(target, source, False)
+    assert target == expected
+
+
+def test_merge_objects_4():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]}
+    merge_objects(target, source, True)
+    assert target == expected
+
+
+def test_merge_objects_5():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]}
+    merge_objects(target, source, False)
+    assert target == expected
+
+def test_merge_objects_6():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]}
+    merge_objects(target, source, True)
+    assert target == expected
+
+
+def test_merge_objects_7():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, "def", 123]}
+    merge_objects(target, source, False)
+    assert target == expected
+
+
+def test_merge_objects_8():
+    target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]}
+    source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]}
+    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]}
+    merge_objects(target, source, True)
+    assert target == expected

From b27428f4edd371e7c50e25f59a267d304f8f9685 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Mon, 27 Nov 2023 15:55:14 -0500
Subject: [PATCH 47/49] lint fixes

---
 test/test_misc_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py
index 7d737793a..6bf0afebe 100644
--- a/test/test_misc_utils.py
+++ b/test/test_misc_utils.py
@@ -3650,6 +3650,7 @@ def test_merge_objects_5():
     merge_objects(target, source, False)
     assert target == expected
 
+
 def test_merge_objects_6():
     target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]}
     source = {"xyzzy": [{"foo": None}, {"goo": None}]}
@@ -3669,6 +3670,7 @@ def test_merge_objects_7():
 def test_merge_objects_8():
     target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]}
     source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]}
-    expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]}
+    expected = {"abc": {"def": {"ghi": None}},
+                "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]}
     merge_objects(target, source, True)
     assert target == expected

From c0ba3d4ef7eabf889569b4617add4da442cb21d7 Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Thu, 30 Nov 2023 05:52:01 -0500
Subject: [PATCH 48/49] Minor update to misc_utils.right_trim

---
 dcicutils/misc_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index e38f980df..0484d6992 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -23,7 +23,7 @@
 from collections import defaultdict
 from datetime import datetime as datetime_type
 from dateutil.parser import parse as dateutil_parse
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 
 # Is this the right place for this? I feel like this should be done in an application, not a library.
@@ -1489,13 +1489,14 @@ def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> Li
     return [item for item in result if item]
 
 
-def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]]) -> Union[List[Any], Tuple[Any]]:
+def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]],
+               remove: Optional[Callable] = None) -> Union[List[Any], Tuple[Any]]:
     """
     Removes training None (or emptry string) values from the give tuple or list arnd returns;
     does NOT change the given value.
     """
     i = len(list_or_tuple) - 1
-    while i >= 0 and list_or_tuple[i] in (None, ""):
+    while i >= 0 and ((remove and remove(list_or_tuple[i])) or (not remove and list_or_tuple[i] in (None, ""))):
         i -= 1
     return list_or_tuple[:i + 1]
 

From 35296b76e886fa87439afadef436a60ad009102a Mon Sep 17 00:00:00 2001
From: David Michaels <david_michaels@hms.harvard.edu>
Date: Thu, 30 Nov 2023 06:25:56 -0500
Subject: [PATCH 49/49] Update version and CHANGELOG.rst.

---
 CHANGELOG.rst  | 2 +-
 pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 63c3ab9c3..317a7391b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ Change Log
 
 8.4.0
 =====
-* More work related to SMaHT ingestion.
+* More work related to SMaHT ingestion (bundle/sheet_utils, data_readers, etc).
 
 
 8.3.0
diff --git a/pyproject.toml b/pyproject.toml
index df6ceec2f..4ae62c250 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.3.0.1b1"  # TODO: To become 8.4.0
+version = "8.4.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"