From ada2c8fadad4abd008d1aa0cf32de5f8bf989ef6 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 7 Nov 2023 16:14:31 -0500 Subject: [PATCH 01/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/bundle_utils.py | 42 +++++++++++++++++++++++++++++++++-- dcicutils/validation_utils.py | 18 +++++++++------ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index fb76c74a5..a55b37e3a 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -15,6 +15,7 @@ PatchPrototype = Dict TabbedPatchPrototypes = Dict[str, PatchPrototype] +ARRAY_VALUE_DELIMITER = "|" class TypeHintContext: @@ -115,6 +116,20 @@ def apply_hint(self, value): return super().apply_hint(value) +class ArrayHint(TypeHint): + def apply_hint(self, value): + if value is None or value == '': + return [] + if isinstance(value, str): + return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] + return super().apply_hint(value) + + +class StringHint(TypeHint): + def apply_hint(self, value): + return str(value).strip() if value is not None else "" + + class RefHint(TypeHint): def __str__(self): @@ -288,8 +303,11 @@ def parse_item_value(cls, value: SheetCellValue, return True elif lvalue == 'false': return False - elif lvalue == 'null' or lvalue == '': + # elif lvalue == 'null' or lvalue == '': + elif lvalue == 'null': return None + elif lvalue == '': + return lvalue elif split_pipe and '|' in value: if value == '|': # Use '|' for [] return [] @@ -307,7 +325,8 @@ def parse_item_value(cls, value: SheetCellValue, @classmethod def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): - if (value is None or value == '') and not force: + # if (value is None or value == '') and not force: + if value is None and not force: return [key, *more_path] = path if not more_path: @@ -327,10 +346,13 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint link_to = subschema.get('linkTo') if link_to and context.schema_exists(link_to): return RefHint(schema_name=link_to, context=context) + return StringHint() elif t in ('integer', 'float', 'number'): return NumHint(declared_type=t) elif t == 'boolean': return BoolHint() + elif t == 'array': + return ArrayHint() @classmethod def find_type_hint_for_parsed_header(cls, parsed_header: Optional[ParsedHeader], schema: Any, @@ -641,6 +663,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # TODO: validate= is presently False (i.e., disabled) by default while being debugged, # but for production use maybe should not be? -kmp 25-Oct-2023 validate: bool = False, + retain_empty_properties: bool = False, **kwargs): annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False, **kwargs) @@ -655,8 +678,23 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # No fancy checking for things like .json, etc. for now. Only check things that came from # spreadsheet-like data, where structural datatypes are forced into strings. checked_items = tabbed_rows + if not retain_empty_properties: + remove_empty_properties(checked_items) if validate: problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) return checked_items, problems return checked_items + + +def remove_empty_properties(data: Optional[Union[list,dict]]) -> None: + if isinstance(data, dict): + for key in list(data.keys()): + value = data[key] + if not value: + del data[key] + else: + remove_empty_properties(value) + elif isinstance(data, list): + for item in data: + remove_empty_properties(item) diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 9e80146a0..35014a3cf 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -87,15 +87,19 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam @classmethod # This operation doesn't actually use the schemas so is safe as a class method def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData: - if not identifying_properties: - raise ValueError("No identifying properties were specified.") - for identifying_property in identifying_properties: + for identifying_property in identifying_properties or []: if identifying_property in data_item: return data_item[identifying_property] - raise ValueError(f'{there_are(identifying_properties, just_are=True)}' - f' no {maybe_pluralize(identifying_properties, "identifying property")}' - f' {disjoined_list([repr(x) for x in identifying_properties])}' - f' in {json.dumps(data_item)}.') + if False: + if not identifying_properties: + raise ValueError("No identifying properties were specified.") + for identifying_property in identifying_properties: + if identifying_property in data_item: + return data_item[identifying_property] + raise ValueError(f'{there_are(identifying_properties, just_are=True)}' + f' no {maybe_pluralize(identifying_properties, "identifying property")}' + f' {disjoined_list([repr(x) for x in identifying_properties])}' + f' in {json.dumps(data_item)}.') @staticmethod def get_identifying_properties(schema: dict) -> list: From cc03d5201452cdf72eaa8a54c915b1b26a764f90 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 7 Nov 2023 16:16:18 -0500 Subject: [PATCH 02/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- CHANGELOG.rst | 6 ++++++ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c50a6d38a..99b3dc103 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,12 @@ dcicutils Change Log ---------- + +8.3.0 +====== +* More work related to SMaHT ingestion. + + 8.2.0 ===== * 2023-11-02 diff --git a/pyproject.toml b/pyproject.toml index 5c3863c3a..122ce5164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.2.0" +version = "8.2.0.1b1" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 7beaa3d34bcf4d1055c3f8e8c4c1f92e80af1e4a Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 8 Nov 2023 13:10:20 -0500 Subject: [PATCH 03/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/sheet_utils.py | 92 ++++++++++++++++++++++++++++++----- dcicutils/validation_utils.py | 1 + 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 66ce523fa..8411c3cb2 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -16,8 +16,8 @@ from tempfile import TemporaryFile, TemporaryDirectory from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from .common import AnyJsonData, Regexp, JsonSchema -from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize # , there_are -from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix # , PRINT, AbstractVirtualApp +from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize +from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_snake_case Header = str @@ -32,6 +32,7 @@ SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] TabbedJsonSchemas = Dict[str, JsonSchema] +CommentRow = object() class LoadFailure(Exception): @@ -296,12 +297,19 @@ def load_content(self) -> AnyJsonData: state = self._create_tab_processor_state(tab_name) for row_data in self._raw_row_generator_for_tab_name(tab_name): processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data) + if not processed_row_data: + break + if processed_row_data is CommentRow: + continue sheet_content.append(processed_row_data) - self.content_by_tab_name[tab_name] = sheet_content + normalized_tab_name = to_snake_case(tab_name.replace(" ", "")) + self.content_by_tab_name[normalized_tab_name] = sheet_content return self.content_by_tab_name - def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: - return prefer_number(value) if self.prefer_number else value + def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData: + if override_prefer_number is None: + override_prefer_number = self.prefer_number + return prefer_number(value) if override_prefer_number else value class TableSetManagerRegistry: @@ -355,8 +363,9 @@ class XlsxManager(FlattenedTableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. """ - ALLOWED_FILE_EXTENSIONS = ['.xlsx'] + TERMINATE_ON_EMPTY_ROW = True + CONVERT_VALUES_TO_STRING = True @classmethod def _all_rows(cls, sheet: Worksheet): @@ -383,21 +392,66 @@ def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: for row in self._all_rows(sheet)) def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: - return [sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)] + # return [str(sheet.cell(row=row, column=col).value) for col in self._all_cols(sheet)] + results = [] + for col in self._all_cols(sheet): + value = sheet.cell(row=row, column=col).value + if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING: + value = str(value).strip() + results.append(value) + return results def _create_tab_processor_state(self, tab_name: str) -> Headers: sheet = self.reader_agent[tab_name] + """ headers: Headers = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] - self.headers_by_tab_name[sheet.title] = headers - return headers + """ + headers = [] + for col in self._all_cols(sheet): + cell = sheet.cell(row=1, column=col).value + if cell is not None and XlsxManager.CONVERT_VALUES_TO_STRING: + cell = str(cell).strip() + headers.append(cell) + self.headers_by_tab_name[sheet.title] = XlsxManager.remove_trailing_none_values(headers) + return self.headers_by_tab_name[sheet.title] def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: ignored(tab_name) - return {headers[i]: self.parse_cell_value(row_datum) + if XlsxManager.is_terminating_row(row_data): + return None + if XlsxManager.is_comment_row(row_data): + return CommentRow + if len(headers) < len(row_data): + row_data = row_data[:len(headers)] + override_prefer_number = False if XlsxManager.CONVERT_VALUES_TO_STRING else None + return {headers[i]: self.parse_cell_value(row_datum, override_prefer_number=override_prefer_number) for i, row_datum in enumerate(row_data)} + @staticmethod + def is_terminating_row(row: List[Optional[Any]]) -> bool: + # TODO: This is may change; currently an all blank row signals the end of input. + return all(cell is None for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW + + @staticmethod + def is_comment_row(row: Tuple[Optional[Any]]) -> bool: + # TODO: This will probably change; currently a row starting only with #, *, or ^ signals a comment. + for cell in row: + if cell is None: + continue + if (cell := str(cell)).startswith("#") or cell.startswith("*") or cell.startswith("^"): + return True + return False + + @staticmethod + def remove_trailing_none_values(values: list[Any]) -> list[Any]: + for index in range(len(values) - 1, -1, -1): + if values[index] is not None: + break + else: + return [] + return values[:index + 1] + def infer_tab_name_from_filename(filename): return os.path.basename(filename).split('.')[0] @@ -688,6 +742,7 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ @classmethod def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + retain_empty_properties: bool = False, **kwargs) -> Dict: """ Given a filename and various options @@ -697,6 +752,8 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) content: TabbedSheetData = manager.load_content() + if not retain_empty_properties: + remove_empty_properties(content) return { 'filename': filename, 'content': content, @@ -709,3 +766,16 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: load_table_set = TableSetManager.load +load_table_annotated = TableSetManager.load_annotated + +def remove_empty_properties(data: Optional[Union[list,dict]]) -> None: + if isinstance(data, dict): + for key in list(data.keys()): + value = data[key] + if not value: + del data[key] + else: + remove_empty_properties(value) + elif isinstance(data, list): + for item in data: + remove_empty_properties(item) diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 35014a3cf..f6612067c 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -63,6 +63,7 @@ def fetch_schema(self, schema_name: str): return override_schema schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look it up again + schema_name = schema_name.replace(" ", "") schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) self.SCHEMA_CACHE[schema_name] = schema return schema From 3de7611e2a47b6796737e9dff9ca22253b1cb944 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 8 Nov 2023 13:54:49 -0500 Subject: [PATCH 04/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/sheet_utils.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 8411c3cb2..30fcd88f0 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -302,8 +302,7 @@ def load_content(self) -> AnyJsonData: if processed_row_data is CommentRow: continue sheet_content.append(processed_row_data) - normalized_tab_name = to_snake_case(tab_name.replace(" ", "")) - self.content_by_tab_name[normalized_tab_name] = sheet_content + self.content_by_tab_name[tab_name] = sheet_content return self.content_by_tab_name def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData: @@ -752,8 +751,6 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) content: TabbedSheetData = manager.load_content() - if not retain_empty_properties: - remove_empty_properties(content) return { 'filename': filename, 'content': content, @@ -767,15 +764,3 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: load_table_set = TableSetManager.load load_table_annotated = TableSetManager.load_annotated - -def remove_empty_properties(data: Optional[Union[list,dict]]) -> None: - if isinstance(data, dict): - for key in list(data.keys()): - value = data[key] - if not value: - del data[key] - else: - remove_empty_properties(value) - elif isinstance(data, list): - for item in data: - remove_empty_properties(item) From 33b64aece187c8d7361a7f229f024fbaaf84d7a9 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 8 Nov 2023 22:49:01 -0500 Subject: [PATCH 05/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/bundle_utils.py | 5 ++-- dcicutils/s3_utils.py | 2 +- dcicutils/sheet_utils.py | 2 +- test/data_files/sample_items2.json | 2 +- test/test_bundle_utils.py | 38 +++++++++++++++++++----- test/test_qa_utils.py | 2 +- test/test_sheet_utils.py | 47 ++++++++++++++++++++++++++---- 7 files changed, 78 insertions(+), 20 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index a55b37e3a..04c6607f0 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -687,11 +687,10 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional return checked_items -def remove_empty_properties(data: Optional[Union[list,dict]]) -> None: +def remove_empty_properties(data: Optional[Union[list, dict]]) -> None: if isinstance(data, dict): for key in list(data.keys()): - value = data[key] - if not value: + if (value := data[key]) is None: del data[key] else: remove_empty_properties(value) diff --git a/dcicutils/s3_utils.py b/dcicutils/s3_utils.py index e34a3f2b7..ea1897beb 100644 --- a/dcicutils/s3_utils.py +++ b/dcicutils/s3_utils.py @@ -318,7 +318,7 @@ def get_key(self, keyfile_name: str = 'access_key_admin' SSECustomerKey=os.environ['S3_ENCRYPT_KEY'], SSECustomerAlgorithm='AES256') body_data: Union[bytes, str] = response['Body'].read() - auth_text: str = body_data.decode() if type(body_data) == bytes else body_data + auth_text: str = body_data.decode() if type(body_data) is bytes else body_data try: return json.loads(auth_text) except (ValueError, TypeError): diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 30fcd88f0..370c22bd8 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -17,7 +17,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from .common import AnyJsonData, Regexp, JsonSchema from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize -from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_snake_case +from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix Header = str diff --git a/test/data_files/sample_items2.json b/test/data_files/sample_items2.json index 7e084f908..0f3b22f90 100644 --- a/test/data_files/sample_items2.json +++ b/test/data_files/sample_items2.json @@ -1,6 +1,6 @@ [ {"name": "john", "sex": "Male", "member": false}, {"name": "juan", "sex": "Male", "member": true}, - {"name": "igor", "sex": "unknown", "member": null}, + {"name": "igor", "sex": "unknown", "member": ""}, {"name": "mary", "sex": "Female", "member": true} ] diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 6e1da6685..c739c8504 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -4,6 +4,7 @@ import os import pytest import re +from typing import List, Optional, Union from dcicutils import ( bundle_utils as bundle_utils_module, @@ -172,7 +173,7 @@ def test_item_tools_parse_item_value_basic(): # Nulls (None, None), - ('', None), ('null', None), ('Null', None), ('NULL', None), + ('', ''), ('null', None), ('Null', None), ('NULL', None), # Booleans ('true', True), ('True', True), ('TRUE', True), @@ -191,9 +192,9 @@ def test_item_tools_parse_item_value_basic(): ('|', []), # special case: lone '|' means empty ('alpha|', ['alpha']), ('7|', [7]), # special case: trailing '|' means singleton # These follow from general case of '|' as separator of items recursively parsed - ('|alpha', [None, 'alpha']), ('|alpha|', [None, 'alpha']), ('|7', [None, 7]), + ('|alpha', ['', 'alpha']), ('|alpha|', ['', 'alpha']), ('|7', ['', 7]), ('alpha|beta|gamma', ['alpha', 'beta', 'gamma']), - ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, None, 7, 1.5]) + ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, '', 7, 1.5]) ] for input, heuristic_result in expectations: @@ -271,6 +272,7 @@ def test_load_table_structures(): print("loaded=", json.dumps(loaded, indent=2)) expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT print("expected=", json.dumps(expected, indent=2)) + # assert loaded == expected assert loaded == expected with pytest.raises(LoadArgumentsError) as exc: @@ -292,9 +294,13 @@ def test_load_items(): # mock_get_schema.return_value = {} with no_schemas(): - assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert load_items(SAMPLE_TSV_FILE, apply_heuristics=True) == SAMPLE_TSV_FILE_ITEM_CONTENT + # TODO: Some work still to do WRT sorting out between None and empty strings in general. + assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == \ + remove_empty_properties(SAMPLE_XLSX_FILE_ITEM_CONTENT) + assert remove_empty_properties(load_items(SAMPLE_CSV_FILE, apply_heuristics=True), [None, '']) == \ + SAMPLE_CSV_FILE_ITEM_CONTENT + assert remove_empty_properties(load_items(SAMPLE_TSV_FILE, apply_heuristics=True), [None, '']) == \ + SAMPLE_TSV_FILE_ITEM_CONTENT with pytest.raises(LoadArgumentsError) as exc: load_items("something.else") @@ -329,7 +335,7 @@ def test_load_items(): SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": False}, {"name": "juan", "sex": "male", "member": True}, - {"name": "igor", "sex": "unknown", "member": None}, + {"name": "igor", "sex": "unknown", "member": ""}, {"name": "mary", "sex": "Female", "member": "t"} ] } @@ -338,7 +344,7 @@ def test_load_items(): "Person": [ {"name": "john", "sex": "Male", "member": False}, {"name": "juan", "sex": "Male", "member": True}, - {"name": "igor", "sex": "unknown", "member": None}, + {"name": "igor", "sex": "unknown", "member": ""}, {"name": "mary", "sex": "Female", "member": True} ] } @@ -733,3 +739,19 @@ def test_table_checker(): flattened=True, portal_env=mock_ff_env) checker.check_tabs() + + +def remove_empty_properties(data: Optional[Union[list, dict]], + empty: Optional[List[Optional[str]]] = None) -> Optional[Union[list, dict]]: + if not empty: + empty = [None] + if isinstance(data, dict): + for key in list(data.keys()): + if (value := data[key]) in empty: + del data[key] + else: + remove_empty_properties(value, empty) + elif isinstance(data, list): + for item in data: + remove_empty_properties(item, empty) + return data diff --git a/test/test_qa_utils.py b/test/test_qa_utils.py index bed502166..6e9c606bf 100644 --- a/test/test_qa_utils.py +++ b/test/test_qa_utils.py @@ -568,7 +568,7 @@ def test_occasionally_errors(): error_message="something") fail_for_a_while(1) except Exception as e: - assert type(e) == SyntaxError + assert type(e) is SyntaxError assert str(e) == "something" diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index c012772c1..6cb2405e3 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,6 +1,7 @@ import json import os import pytest +from typing import Optional, Union from collections import namedtuple from dcicutils.sheet_utils import ( @@ -116,7 +117,7 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') -SAMPLE_XLSX_FILE_RAW_CONTENT = { +old_SAMPLE_XLSX_FILE_RAW_CONTENT = { "Sheet1": [ {"x": 1, "y.a": 1, "y.z": 1}, {"x": 1, "y.a": 2, "y.z": 3}, @@ -139,6 +140,29 @@ def test_table_set_manager_registry_manager_for_filename(): }, ] } +SAMPLE_XLSX_FILE_RAW_CONTENT = { + "Sheet1": [ + {"x": "1", "y.a": "1", "y.z": "1"}, + {"x": "1", "y.a": "2", "y.z": "3"}, + {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, + ], + "Sheet2": [ + { + "name": "bill", "age": "23", + "mother.name": "mary", "mother.age": "58", + "father.name": "fred", "father.age": "63", + "friends#0.name": "sam", "friends#0.age": "22", + "friends#1.name": "arthur", "friends#1.age": "19", + }, + { + "name": "joe", "age": "9", + "mother.name": "estrella", "mother.age": "35", + "father.name": "anthony", "father.age": "34", + "friends#0.name": "anders", "friends#0.age": "9", + "friends#1.name": None, "friends#1.age": None, + }, + ] +} SAMPLE_XLSX_FILE_INFLATED_CONTENT = { "Sheet1": [ @@ -243,11 +267,11 @@ def test_xlsx_manager_load_csv(): def test_csv_manager_load_content(): wt = CsvManager(SAMPLE_CSV_FILE) - assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT + assert convert_number_to_string(wt.load_content()) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load(): - assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + assert convert_number_to_string(CsvManager.load(SAMPLE_CSV_FILE)) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load_csv(): @@ -269,11 +293,11 @@ def test_csv_escaping(): def test_tsv_manager_load_content(): wt = TsvManager(SAMPLE_TSV_FILE) - assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT + assert convert_number_to_string(wt.load_content()) == SAMPLE_TSV_FILE_RAW_CONTENT def test_tsv_manager_load(): - assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT + assert convert_number_to_string(TsvManager.load(SAMPLE_TSV_FILE)) == SAMPLE_TSV_FILE_RAW_CONTENT def test_tsv_manager_load_csv(): @@ -281,3 +305,16 @@ def test_tsv_manager_load_csv(): TsvManager.load(SAMPLE_XLSX_FILE) assert str(exc.value).startswith('The TableSetManager subclass TsvManager' ' expects only .tsv or .tsv.txt filenames:') + + +def convert_number_to_string(data: Optional[Union[list, dict]]) -> Optional[Union[list, dict]]: + if isinstance(data, dict): + for key in list(data.keys()): + if isinstance(value := data[key], int): + data[key] = str(value) + else: + convert_number_to_string(value) + elif isinstance(data, list): + for item in data: + convert_number_to_string(item) + return data From 701384f05e4b6f4809157b7804aada5f15b7fa4d Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 8 Nov 2023 22:54:34 -0500 Subject: [PATCH 06/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/validation_utils.py | 12 +++++------- test/test_validation_utils.py | 8 ++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index f6612067c..56e473501 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -87,16 +87,14 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam return identifying_properties @classmethod # This operation doesn't actually use the schemas so is safe as a class method - def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData: + def identifying_value(cls, data_item: Dict[str, AnyJsonData], + identifying_properties, raise_exception: bool = False) -> AnyJsonData: + if not identifying_properties and raise_exception: + raise ValueError("No identifying properties were specified.") for identifying_property in identifying_properties or []: if identifying_property in data_item: return data_item[identifying_property] - if False: - if not identifying_properties: - raise ValueError("No identifying properties were specified.") - for identifying_property in identifying_properties: - if identifying_property in data_item: - return data_item[identifying_property] + if raise_exception: raise ValueError(f'{there_are(identifying_properties, just_are=True)}' f' no {maybe_pluralize(identifying_properties, "identifying property")}' f' {disjoined_list([repr(x) for x in identifying_properties])}' diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py index 40878a222..9129e05c0 100644 --- a/test/test_validation_utils.py +++ b/test/test_validation_utils.py @@ -73,20 +73,20 @@ def test_schema_manager_with_schemas(schema_id): def test_schema_manager_identifying_value(): with pytest.raises(ValueError) as exc: - assert SchemaManager.identifying_value({'any': 'thing'}, []) + assert SchemaManager.identifying_value({'any': 'thing'}, [], raise_exception=True) assert str(exc.value) == "No identifying properties were specified." person_named_fred = {'age': 33, 'name': 'Fred', 'favorite-color': 'yellow'} - assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name']) == 'Fred' + assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name'], raise_exception=True) == 'Fred' person_nicknamed_fred = {'age': 33, 'nickname': 'Fred', 'favorite-color': 'yellow'} with pytest.raises(ValueError) as exc: - SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name']) + SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name'], raise_exception=True) assert str(exc.value) == ("""There are no identifying properties 'uuid' or 'name'""" """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""") with pytest.raises(ValueError) as exc: - SchemaManager.identifying_value(person_nicknamed_fred, ['name']) + SchemaManager.identifying_value(person_nicknamed_fred, ['name'], raise_exception=True) assert str(exc.value) == ("""There is no identifying property 'name'""" """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""") From d49fd69f27e02a1f089e2cc96babc788156c9b3b Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 8 Nov 2023 22:57:45 -0500 Subject: [PATCH 07/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/sheet_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 370c22bd8..54ca37fac 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -443,7 +443,7 @@ def is_comment_row(row: Tuple[Optional[Any]]) -> bool: return False @staticmethod - def remove_trailing_none_values(values: list[Any]) -> list[Any]: + def remove_trailing_none_values(values: List[Any]) -> list[Any]: for index in range(len(values) - 1, -1, -1): if values[index] is not None: break From 4e21b859f6d13fe143612cddd139cfa0054f090b Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 8 Nov 2023 22:59:23 -0500 Subject: [PATCH 08/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/sheet_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 54ca37fac..771f06bcd 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -443,7 +443,7 @@ def is_comment_row(row: Tuple[Optional[Any]]) -> bool: return False @staticmethod - def remove_trailing_none_values(values: List[Any]) -> list[Any]: + def remove_trailing_none_values(values: List[Any]) -> List[Any]: for index in range(len(values) - 1, -1, -1): if values[index] is not None: break From 4e55f2914cdd85bdf9ecf423c4b25cfd92620173 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 9 Nov 2023 16:33:21 -0500 Subject: [PATCH 09/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/bundle_utils.py | 39 ++++++++++++++++++----------- dcicutils/sheet_utils.py | 20 ++++++++------- dcicutils/validation_utils.py | 8 +++--- test/data_files/escaping-false.json | 16 ++++++------ test/data_files/escaping-true.json | 16 ++++++------ test/test_bundle_utils.py | 7 +++--- test/test_sheet_utils.py | 28 ++++++++++----------- 7 files changed, 75 insertions(+), 59 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 04c6607f0..6e0dc4553 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -74,11 +74,11 @@ def apply_hint(self, value): class NumHint(TypeHint): - PREFERENCE_MAP = {'number': 'num', 'integer': 'int', 'float': 'float'} + PREFERENCE_MAP = {'number': 'number', 'integer': 'integer'} def __init__(self, declared_type: Optional[str] = None): if declared_type is None: - declared_type = 'num' + declared_type = 'number' self.preferred_type = self.PREFERENCE_MAP.get(declared_type) def apply_hint(self, value): @@ -347,7 +347,7 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint if link_to and context.schema_exists(link_to): return RefHint(schema_name=link_to, context=context) return StringHint() - elif t in ('integer', 'float', 'number'): + elif t in ('integer', 'number'): return NumHint(declared_type=t) elif t == 'boolean': return BoolHint() @@ -452,7 +452,8 @@ class TableChecker(InflatableTabbedDataManager, TypeHintContext): def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, override_schemas: Optional[TabbedJsonSchemas] = None, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, - apply_heuristics: bool = False): + apply_heuristics: bool = False, + noschemas: bool = False): self.flattened = flattened # if not flattened: @@ -462,7 +463,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, # # -kmp 25-Oct-2023 # raise ValueError("Only flattened=True is supported by TableChecker for now.") - if portal_env is None and portal_vapp is None: + if portal_env is None and portal_vapp is None and not noschemas: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) # InflatableTabbedDataManager supplies: # self.tabbed_sheet_data: TabbedSheetData = @@ -473,9 +474,14 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, super().__init__(tabbed_sheet_data=tabbed_sheet_data, apply_heuristics=apply_heuristics) self.portal_env = portal_env self.portal_vapp = portal_vapp - self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, - override_schemas=override_schemas) - self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names) # , schemas=schemas) + if not noschemas: + self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, + override_schemas=override_schemas, noschemas=noschemas) + schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value] + self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None # , schemas=schemas) + else: + self.schema_manager = None + self.schemas = {} self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) for tab_name, rows in tabbed_sheet_data.items() @@ -496,6 +502,8 @@ def note_problem(self, problem: str): self._problems.append(problem) def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]: + if not self.schema_manager or not rows: + return [] identifying_properties = self.schema_manager.identifying_properties(schema_name=tab_name) if not identifying_properties: # Maybe issue a warning here that we're going to lose @@ -554,7 +562,7 @@ def validate_ref(self, item_type, item_ref): return False def schema_exists(self, schema_name: str) -> bool: - return self.schema_manager.schema_exists(schema_name) + return self.schema_manager.schema_exists(schema_name) if self.schema_manager else False def check_tab(self, tab_name: str): prototype = self.patch_prototypes_by_tab_name[tab_name] @@ -617,10 +625,11 @@ def check(cls, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, override_schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False, + noschemas: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): checker = cls(tabbed_sheet_data, flattened=flattened, override_schemas=override_schemas, apply_heuristics=apply_heuristics, - portal_env=portal_env, portal_vapp=portal_vapp) + portal_env=portal_env, portal_vapp=portal_vapp, noschemas=noschemas) checked = checker.check_tabs() return checked @@ -654,7 +663,7 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState: type_hints=self.type_hints_by_tab_name[tab_name]) -check = TableChecker.check +# check = TableChecker.check def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, @@ -664,6 +673,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # but for production use maybe should not be? -kmp 25-Oct-2023 validate: bool = False, retain_empty_properties: bool = False, + noschemas: bool = False, **kwargs): annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False, **kwargs) @@ -673,14 +683,15 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional checked_items = TableChecker.check(tabbed_rows, flattened=flattened, override_schemas=override_schemas, portal_env=portal_env, portal_vapp=portal_vapp, - apply_heuristics=apply_heuristics) + apply_heuristics=apply_heuristics, + noschemas=noschemas) else: # No fancy checking for things like .json, etc. for now. Only check things that came from # spreadsheet-like data, where structural datatypes are forced into strings. checked_items = tabbed_rows if not retain_empty_properties: remove_empty_properties(checked_items) - if validate: + if validate and not noschemas: problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) return checked_items, problems @@ -690,7 +701,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional def remove_empty_properties(data: Optional[Union[list, dict]]) -> None: if isinstance(data, dict): for key in list(data.keys()): - if (value := data[key]) is None: + if (value := data[key]) in [None, ""]: del data[key] else: remove_empty_properties(value) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 771f06bcd..b3f191f15 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -70,7 +70,7 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") -def prefer_number(value: SheetCellValue, kind='num'): +def prefer_number(value: SheetCellValue, kind='number'): """ Given a string, if the string has number syntax, returns the number it represents. Otherwise, returns its argument. (It follows from this that if given an int or a float, it just returns that argument.) @@ -79,7 +79,7 @@ def prefer_number(value: SheetCellValue, kind='num'): is coerced to, but it has no effect when the argument is a number, even a number of the wrong kind. :param value: a string, int, or float - :param kind: one of 'num', 'int', or 'float' + :param kind: one of 'number' or 'integer' :returns: the argument coerced to a number of the appropriate kind, if possible, or else the argument literally """ if isinstance(value, str): # the given value might be an int or float, in which case just fall through @@ -88,12 +88,12 @@ def prefer_number(value: SheetCellValue, kind='num'): value = value ch0 = value[0] if ch0 == '+' or ch0 == '-' or ch0.isdigit(): - if kind == 'num' or kind == 'int': + if kind == 'integer': try: return int(value) except Exception: pass - if kind == 'num' or kind == 'float': + if kind == 'number': try: return float(value) except Exception: @@ -302,7 +302,7 @@ def load_content(self) -> AnyJsonData: if processed_row_data is CommentRow: continue sheet_content.append(processed_row_data) - self.content_by_tab_name[tab_name] = sheet_content + self.content_by_tab_name[tab_name.replace(" ", "")] = sheet_content return self.content_by_tab_name def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData: @@ -395,8 +395,10 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: results = [] for col in self._all_cols(sheet): value = sheet.cell(row=row, column=col).value - if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING: - value = str(value).strip() +# if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING: +# value = str(value).strip() + if XlsxManager.CONVERT_VALUES_TO_STRING: + value = str(value).strip() if value is not None else "" results.append(value) return results @@ -642,10 +644,10 @@ def _escape_cell_text(cls, cell_text): def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: ignored(tab_name) if self.escaping: - return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text)) + return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text), override_prefer_number=False) for i, cell_text in enumerate(row_data)} else: - return {headers[i]: self.parse_cell_value(cell_text) + return {headers[i]: self.parse_cell_value(cell_text, override_prefer_number=False) for i, cell_text in enumerate(row_data)} diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 56e473501..e3a0e050c 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -27,10 +27,10 @@ def fresh_schema_manager_context_for_testing(cls): # finally: # cls.SCHEMA_CACHE = old_schema_cache - def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, + def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, noschemas: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): self.SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. - if portal_env is None and portal_vapp is None: + if portal_env is None and portal_vapp is None and not noschemas: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.") self.portal_env = portal_env @@ -174,7 +174,9 @@ def validate_data_against_schemas(data: TabbedSheetData, *, schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) errors = [] - schemas = schema_manager.fetch_relevant_schemas(list(data.keys())) + + schema_names_to_fetch = [key for key, value in data.items() if value] + schemas = schema_manager.fetch_relevant_schemas(schema_names_to_fetch) for data_type in data: schema = schemas.get(data_type) diff --git a/test/data_files/escaping-false.json b/test/data_files/escaping-false.json index 84ab06993..3c95926d9 100644 --- a/test/data_files/escaping-false.json +++ b/test/data_files/escaping-false.json @@ -5,56 +5,56 @@ "unquoted": "\\\\", "doublequoted": "\\\\", "singlequoted": "'\\\\'", - "overflow": null + "overflow": "" }, { "name": "formfeed", "unquoted": "\\f", "doublequoted": "\\f", "singlequoted": "'\\f'", - "overflow": null + "overflow": "" }, { "name": "newline", "unquoted": "\\n", "doublequoted": "\\n", "singlequoted": "'\\n'", - "overflow": null + "overflow": "" }, { "name": "return", "unquoted": "\\r", "doublequoted": "\\r", "singlequoted": "'\\r'", - "overflow": null + "overflow": "" }, { "name": "tab", "unquoted": "\\t", "doublequoted": "\\t", "singlequoted": "'\\t'", - "overflow": null + "overflow": "" }, { "name": "misc", "unquoted": "\\m", "doublequoted": "\\m", "singlequoted": "'\\m'", - "overflow": null + "overflow": "" }, { "name": "quote1", "unquoted": "N/A", "doublequoted": "x,,z", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "quotelong", "unquoted": "N/A", "doublequoted": "x,,z,N/A\nquotlongcontinued,", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "comma", diff --git a/test/data_files/escaping-true.json b/test/data_files/escaping-true.json index 5d6c837a6..bae122afd 100644 --- a/test/data_files/escaping-true.json +++ b/test/data_files/escaping-true.json @@ -5,56 +5,56 @@ "unquoted": "\\", "doublequoted": "\\", "singlequoted": "'\\'", - "overflow": null + "overflow": "" }, { "name": "formfeed", "unquoted": "\\f", "doublequoted": "\\f", "singlequoted": "'\\f'", - "overflow": null + "overflow": "" }, { "name": "newline", "unquoted": "\n", "doublequoted": "\n", "singlequoted": "'\n'", - "overflow": null + "overflow": "" }, { "name": "return", "unquoted": "\r", "doublequoted": "\r", "singlequoted": "'\r'", - "overflow": null + "overflow": "" }, { "name": "tab", "unquoted": "\t", "doublequoted": "\t", "singlequoted": "'\t'", - "overflow": null + "overflow": "" }, { "name": "misc", "unquoted": "\\m", "doublequoted": "\\m", "singlequoted": "'\\m'", - "overflow": null + "overflow": "" }, { "name": "quote1", "unquoted": "N/A", "doublequoted": "x,,z", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "quotelong", "unquoted": "N/A", "doublequoted": "x,,z,N/A\nquotlongcontinued,", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "comma", diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index c739c8504..89de7b8a0 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -326,7 +326,7 @@ def test_load_items(): SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": "false"}, {"name": "juan", "sex": "male", "member": "true"}, - {"name": "igor", "sex": "unknown", "member": None}, + {"name": "igor", "sex": "unknown", "member": ""}, {"name": "mary", "sex": "Female", "member": "t"} ] } @@ -335,7 +335,7 @@ def test_load_items(): SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": False}, {"name": "juan", "sex": "male", "member": True}, - {"name": "igor", "sex": "unknown", "member": ""}, + {"name": "igor", "sex": "unknown"}, {"name": "mary", "sex": "Female", "member": "t"} ] } @@ -344,7 +344,7 @@ def test_load_items(): "Person": [ {"name": "john", "sex": "Male", "member": False}, {"name": "juan", "sex": "Male", "member": True}, - {"name": "igor", "sex": "unknown", "member": ""}, + {"name": "igor", "sex": "unknown"}, {"name": "mary", "sex": "Female", "member": True} ] } @@ -466,6 +466,7 @@ def test_load_items_with_schema(): print("Case 3") expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + import pdb ; pdb.set_trace() assert actual == expected diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 6cb2405e3..276b17b23 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -117,25 +117,25 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') -old_SAMPLE_XLSX_FILE_RAW_CONTENT = { +backup_SAMPLE_XLSX_FILE_RAW_CONTENT = { "Sheet1": [ - {"x": 1, "y.a": 1, "y.z": 1}, - {"x": 1, "y.a": 2, "y.z": 3}, + {"x": "1", "y.a": "1", "y.z": "1"}, + {"x": "1", "y.a": "2", "y.z": "3"}, {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, ], "Sheet2": [ { - "name": "bill", "age": 23, - "mother.name": "mary", "mother.age": 58, - "father.name": "fred", "father.age": 63, - "friends#0.name": "sam", "friends#0.age": 22, - "friends#1.name": "arthur", "friends#1.age": 19, + "name": "bill", "age": "23", + "mother.name": "mary", "mother.age": "58", + "father.name": "fred", "father.age": "63", + "friends#0.name": "sam", "friends#0.age": "22", + "friends#1.name": "arthur", "friends#1.age": "19", }, { - "name": "joe", "age": 9, - "mother.name": "estrella", "mother.age": 35, - "father.name": "anthony", "father.age": 34, - "friends#0.name": "anders", "friends#0.age": 9, + "name": "joe", "age": "9", + "mother.name": "estrella", "mother.age": "35", + "father.name": "anthony", "father.age": "34", + "friends#0.name": "anders", "friends#0.age": "9", "friends#1.name": None, "friends#1.age": None, }, ] @@ -159,7 +159,7 @@ def test_table_set_manager_registry_manager_for_filename(): "mother.name": "estrella", "mother.age": "35", "father.name": "anthony", "father.age": "34", "friends#0.name": "anders", "friends#0.age": "9", - "friends#1.name": None, "friends#1.age": None, + "friends#1.name": "", "friends#1.age": "", }, ] } @@ -186,7 +186,7 @@ def test_table_set_manager_registry_manager_for_filename(): "father": {"name": "anthony", "age": 34}, "friends": [ {"name": "anders", "age": 9}, - {"name": None, "age": None} + {"name": "", "age": ""} ] }, ], From 23d29e82f742cf89cec741af51af15d272189738 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 11 Nov 2023 14:59:02 -0500 Subject: [PATCH 10/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/sheet_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index b3f191f15..c0d7d0a80 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -432,7 +432,7 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A @staticmethod def is_terminating_row(row: List[Optional[Any]]) -> bool: # TODO: This is may change; currently an all blank row signals the end of input. - return all(cell is None for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW + return all(cell is None or cell == "" for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW @staticmethod def is_comment_row(row: Tuple[Optional[Any]]) -> bool: From d80817e1b5c6f8d2387ee4da1ff66cb5cfa1cc88 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 11 Nov 2023 23:09:13 -0500 Subject: [PATCH 11/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- dcicutils/bundle_utils.py | 51 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 6e0dc4553..92953b5b3 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,3 +1,4 @@ +import json import copy from typing import Any, Dict, List, Optional, Tuple, Union @@ -45,6 +46,9 @@ def __init__(self, problems: Optional[dict] = None): class TypeHint: + def __init__(self): + self.is_array = False + def apply_hint(self, value): return value @@ -56,6 +60,8 @@ def __repr__(self): class BoolHint(TypeHint): + def __init__(self): + super().__init__() # We could use other ways to do this, such as initial substring, but this is more likely to be right. # Then again, we might want to consder accepting athers like 'yes/no', 'y/n', 'on/off', '1/0'. @@ -80,6 +86,7 @@ def __init__(self, declared_type: Optional[str] = None): if declared_type is None: declared_type = 'number' self.preferred_type = self.PREFERENCE_MAP.get(declared_type) + super().__init__() def apply_hint(self, value): if isinstance(value, str) and value: @@ -97,6 +104,7 @@ def __str__(self): def __init__(self, value_map): self.value_map = value_map + super().__init__() def apply_hint(self, value): if isinstance(value, str): @@ -117,6 +125,9 @@ def apply_hint(self, value): class ArrayHint(TypeHint): + def __init__(self): + super().__init__() + def apply_hint(self, value): if value is None or value == '': return [] @@ -126,6 +137,9 @@ def apply_hint(self, value): class StringHint(TypeHint): + def __init__(self): + super().__init__() + def apply_hint(self, value): return str(value).strip() if value is not None else "" @@ -138,9 +152,16 @@ def __str__(self): def __init__(self, schema_name: str, context: TypeHintContext): self.schema_name = schema_name self.context = context + super().__init__() def apply_hint(self, value): - if not self.context.validate_ref(item_type=self.schema_name, item_ref=value): + if self.is_array: + value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] + for item in value: + if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): + raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}") + return value + if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value): raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}") return value @@ -352,6 +373,10 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint elif t == 'boolean': return BoolHint() elif t == 'array': + array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), context) + if type(array_type_hint) == RefHint: + array_type_hint.is_array = True + return array_type_hint return ArrayHint() @classmethod @@ -527,7 +552,15 @@ def contains_ref(self, item_type, item_ref): def resolve_ref(self, item_type, item_ref): lookup_table = self.lookup_tables_by_tab_name.get(item_type) if lookup_table: # Is it a type we're tracking? - return lookup_table.get(item_ref) or None + if isinstance(item_ref, list): + found = True + for item in item_ref: + if not lookup_table.get(item): + found = False + break + return True if found else None + else: + return lookup_table.get(item_ref) or None else: # Apparently some stray type not in our tables return None @@ -553,8 +586,18 @@ def validate_ref(self, item_type, item_ref): return True try: # TODO: This probably needs a cache - info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", - ff_env=self.portal_env, vapp=self.portal_vapp) + if isinstance(item_ref, list): + found = True + for item in item_ref: + info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", + ff_env=self.portal_env, vapp=self.portal_vapp) + if not isinstance(info, dict) or 'uuid' not in info: + found = False + break + return found + else: + info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", + ff_env=self.portal_env, vapp=self.portal_vapp) # Basically return True if there's a value at all, # but still check it's not an error message that didn't get raised. return isinstance(info, dict) and 'uuid' in info From 7260521cc3abfd6d4f7ee9d73b81758f4a87e924 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 11 Nov 2023 23:10:13 -0500 Subject: [PATCH 12/49] Changes to bundle_utils/etc related to SMaHT ingestion work. --- test/test_bundle_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 89de7b8a0..3edd4acc0 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -466,7 +466,6 @@ def test_load_items_with_schema(): print("Case 3") expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') - import pdb ; pdb.set_trace() assert actual == expected From 676b7ceb57343052fd8166f619144adcc4ecae75 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 13 Nov 2023 14:52:38 -0500 Subject: [PATCH 13/49] Minor sheet_utils updates related to SMaHT ingestion. --- dcicutils/bundle_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 92953b5b3..82e115e65 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -132,6 +132,8 @@ def apply_hint(self, value): if value is None or value == '': return [] if isinstance(value, str): + if not value: + return [] return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] return super().apply_hint(value) @@ -156,7 +158,7 @@ def __init__(self, schema_name: str, context: TypeHintContext): def apply_hint(self, value): if self.is_array: - value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] + value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else [] for item in value: if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}") @@ -589,7 +591,7 @@ def validate_ref(self, item_type, item_ref): if isinstance(item_ref, list): found = True for item in item_ref: - info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", + info = get_metadata(f"/{to_camel_case(item_type)}/{item}", ff_env=self.portal_env, vapp=self.portal_vapp) if not isinstance(info, dict) or 'uuid' not in info: found = False From 352c8016fde503ba5e7cf2afad6b657df0c55bc7 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 13 Nov 2023 14:58:26 -0500 Subject: [PATCH 14/49] lint fixes --- dcicutils/bundle_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 82e115e65..ddb8fda68 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,4 +1,3 @@ -import json import copy from typing import Any, Dict, List, Optional, Tuple, Union @@ -505,7 +504,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas, noschemas=noschemas) schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value] - self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None # , schemas=schemas) + self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None else: self.schema_manager = None self.schemas = {} From 28b780bf75bc2cfa8767c3bc3d75f0efbcc31d72 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 14 Nov 2023 12:56:20 -0500 Subject: [PATCH 15/49] SMaHT ingestion work. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 122ce5164..259063fbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.2.0.1b1" +version = "8.2.0.1b1" # TODO: To become 8.3.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 8d3d4322a46ca33259ae1f81acd0eda288befe4c Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 15 Nov 2023 10:59:10 -0500 Subject: [PATCH 16/49] Support ordering in sheet_utils. --- dcicutils/bundle_utils.py | 3 ++- dcicutils/sheet_utils.py | 39 ++++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index ddb8fda68..15863a805 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -718,9 +718,10 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional validate: bool = False, retain_empty_properties: bool = False, noschemas: bool = False, + sheet_order: Optional[List[str]] = None, **kwargs): annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, - prefer_number=False, **kwargs) + prefer_number=False, sheet_order=sheet_order, **kwargs) tabbed_rows = annotated_data['content'] flattened = annotated_data['flattened'] if flattened: diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index c0d7d0a80..07b15a993 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -17,7 +17,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from .common import AnyJsonData, Regexp, JsonSchema from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize -from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix +from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_camel_case Header = str @@ -199,10 +199,10 @@ def load(cls, filename: str, **kwargs) -> TabbedSheetData: raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA @property - def tab_names(self) -> List[str]: + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA - def load_content(self) -> Any: + def load_content(self, sheet_order: Optional[List[str]] = None) -> Any: raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA @@ -291,8 +291,8 @@ def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> """ raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") # noQA - def load_content(self) -> AnyJsonData: - for tab_name in self.tab_names: + def load_content(self, sheet_order: Optional[List[str]] = None) -> AnyJsonData: + for tab_name in self.tab_names(sheet_order): sheet_content = [] state = self._create_tab_processor_state(tab_name) for row_data in self._raw_row_generator_for_tab_name(tab_name): @@ -378,9 +378,20 @@ def _all_cols(cls, sheet: Worksheet): for col in range(1, col_max + 1): yield col - @property - def tab_names(self) -> List[str]: - return self.reader_agent.sheetnames + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: + def ordered_sheet_names(sheet_names: List[str]) -> List[str]: + if not order: + return sheet_names + ordered_sheet_names = [] + for item in order: + for sheet_name in sheet_names: + if to_camel_case(item.replace(" ", "")) == to_camel_case(sheet_name.replace(" ", "")): + ordered_sheet_names.append(sheet_name) + for sheet_name in sheet_names: + if sheet_name not in ordered_sheet_names: + ordered_sheet_names.append(sheet_name) + return ordered_sheet_names + return ordered_sheet_names(self.reader_agent.sheetnames) def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) @@ -464,8 +475,7 @@ def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): self._tab_name = tab_name or infer_tab_name_from_filename(filename) super().__init__(filename=filename, **kwargs) - @property - def tab_names(self) -> List[str]: + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: return [self._tab_name] @@ -492,8 +502,7 @@ def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData: ignored(filename) return data - @property - def tab_names(self) -> List[str]: + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: return list(self.content_by_tab_name.keys()) def _get_reader_agent(self) -> Any: @@ -512,7 +521,7 @@ def extract_tabbed_headers(cls, data: TabbedSheetData) -> TabbedHeaders: result[tab] = headers return result - def load_content(self) -> Dict[str, AnyJsonData]: + def load_content(self, sheet_order: Optional[List[str]] = None) -> Dict[str, AnyJsonData]: data = self._load_inserts_data(self.filename) self.content_by_tab_name = data self.headers_by_tab_name = self.extract_tabbed_headers(data) @@ -743,7 +752,7 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ @classmethod def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - retain_empty_properties: bool = False, + retain_empty_properties: bool = False, sheet_order: Optional[List[str]] = None, **kwargs) -> Dict: """ Given a filename and various options @@ -752,7 +761,7 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: with maybe_unpack(filename) as filename: manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) - content: TabbedSheetData = manager.load_content() + content: TabbedSheetData = manager.load_content(sheet_order) return { 'filename': filename, 'content': content, From 728c5bbd22baaeabdf6a0df319944006b3006c2e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 15 Nov 2023 16:52:23 -0500 Subject: [PATCH 17/49] Minor sheet_utils updates related to SMaHT ingestion. --- dcicutils/bundle_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 15863a805..5240790b7 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -162,6 +162,10 @@ def apply_hint(self, value): if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}") return value + self._apply_ref_hint(value) + return value + + def _apply_ref_hint(self, value): if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value): raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}") return value From bb7ebc909bceb30f5d487e1e4aa5ebcea98a16cc Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 15 Nov 2023 17:06:57 -0500 Subject: [PATCH 18/49] Minor sheet_utils updates related to SMaHT ingestion. --- dcicutils/bundle_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 5240790b7..2f2b33987 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -750,7 +750,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional def remove_empty_properties(data: Optional[Union[list, dict]]) -> None: if isinstance(data, dict): for key in list(data.keys()): - if (value := data[key]) in [None, ""]: + if (value := data[key]) in [None, "", {}, []]: del data[key] else: remove_empty_properties(value) From beb9cf7a735781a6440602c3c38bd30584cdd8b6 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 15 Nov 2023 17:35:08 -0500 Subject: [PATCH 19/49] Minor sheet_utils updates related to SMaHT ingestion. --- dcicutils/bundle_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 2f2b33987..c9066f2ec 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -158,14 +158,15 @@ def __init__(self, schema_name: str, context: TypeHintContext): def apply_hint(self, value): if self.is_array: value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else [] - for item in value: - if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): - raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}") return value self._apply_ref_hint(value) return value def _apply_ref_hint(self, value): + if self.is_array: + for item in value: + if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): + raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}") if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value): raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}") return value @@ -269,7 +270,7 @@ def parse_sheet_headers(cls, headers: Headers) -> ParsedHeaders: def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): prototype = {} for parsed_header in parsed_headers: - parsed_header0 = parsed_header[0] + parsed_header0 = parsed_header[0] if parsed_header else "" if isinstance(parsed_header0, int): raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) @@ -277,7 +278,9 @@ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): @classmethod def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): - [key0, *more_keys] = keys + key0 = None + more_keys = None + [key0, *more_keys] = keys if keys else [None] key1 = more_keys[0] if more_keys else None if isinstance(key1, int): placeholder = [] @@ -354,7 +357,7 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any # if (value is None or value == '') and not force: if value is None and not force: return - [key, *more_path] = path + [key, *more_path] = path if path else [None] if not more_path: datum[key] = value else: From 023bc78d4b2cd706f993641f7b689ddd8a61c42f Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 16 Nov 2023 08:56:40 -0500 Subject: [PATCH 20/49] Minor sheet_utils updates related to SMaHT ingestion. --- dcicutils/bundle_utils.py | 26 +++++++++----------------- dcicutils/validation_utils.py | 10 +++++++--- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index c9066f2ec..93b08a2e0 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -485,8 +485,7 @@ class TableChecker(InflatableTabbedDataManager, TypeHintContext): def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, override_schemas: Optional[TabbedJsonSchemas] = None, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, - apply_heuristics: bool = False, - noschemas: bool = False): + apply_heuristics: bool = False): self.flattened = flattened # if not flattened: @@ -496,7 +495,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, # # -kmp 25-Oct-2023 # raise ValueError("Only flattened=True is supported by TableChecker for now.") - if portal_env is None and portal_vapp is None and not noschemas: + if portal_env is None and portal_vapp is None: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) # InflatableTabbedDataManager supplies: # self.tabbed_sheet_data: TabbedSheetData = @@ -507,14 +506,10 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, super().__init__(tabbed_sheet_data=tabbed_sheet_data, apply_heuristics=apply_heuristics) self.portal_env = portal_env self.portal_vapp = portal_vapp - if not noschemas: - self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, - override_schemas=override_schemas, noschemas=noschemas) - schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value] - self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) if not noschemas else None - else: - self.schema_manager = None - self.schemas = {} + self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, + override_schemas=override_schemas) + schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value] + self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) for tab_name, rows in tabbed_sheet_data.items() @@ -676,11 +671,10 @@ def check(cls, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, override_schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False, - noschemas: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): checker = cls(tabbed_sheet_data, flattened=flattened, override_schemas=override_schemas, apply_heuristics=apply_heuristics, - portal_env=portal_env, portal_vapp=portal_vapp, noschemas=noschemas) + portal_env=portal_env, portal_vapp=portal_vapp) checked = checker.check_tabs() return checked @@ -724,7 +718,6 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # but for production use maybe should not be? -kmp 25-Oct-2023 validate: bool = False, retain_empty_properties: bool = False, - noschemas: bool = False, sheet_order: Optional[List[str]] = None, **kwargs): annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, @@ -735,15 +728,14 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional checked_items = TableChecker.check(tabbed_rows, flattened=flattened, override_schemas=override_schemas, portal_env=portal_env, portal_vapp=portal_vapp, - apply_heuristics=apply_heuristics, - noschemas=noschemas) + apply_heuristics=apply_heuristics) else: # No fancy checking for things like .json, etc. for now. Only check things that came from # spreadsheet-like data, where structural datatypes are forced into strings. checked_items = tabbed_rows if not retain_empty_properties: remove_empty_properties(checked_items) - if validate and not noschemas: + if validate: problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) return checked_items, problems diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index e3a0e050c..abe6bcb2e 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -27,10 +27,10 @@ def fresh_schema_manager_context_for_testing(cls): # finally: # cls.SCHEMA_CACHE = old_schema_cache - def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, noschemas: bool = False, + def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): self.SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. - if portal_env is None and portal_vapp is None and not noschemas: + if portal_env is None and portal_vapp is None: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.") self.portal_env = portal_env @@ -64,10 +64,14 @@ def fetch_schema(self, schema_name: str): schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look it up again schema_name = schema_name.replace(" ", "") - schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + schema = SchemaManager.get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) self.SCHEMA_CACHE[schema_name] = schema return schema + @staticmethod + def get_schema(name: str, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + return get_schema(name, portal_env=portal_env, portal_vapp=portal_vapp) + # Should not be needed, given that SCHEMA_CACHE is an instance variable. # # @classmethod From 5f432bf42039d195875a8e87d98ed4c501bbb5e3 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 16 Nov 2023 15:10:54 -0500 Subject: [PATCH 21/49] Minor sheet_utils updates related to SMaHT ingestion; handle missing refs. --- dcicutils/bundle_utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 93b08a2e0..577c43dc2 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -150,9 +150,10 @@ class RefHint(TypeHint): def __str__(self): return f"" - def __init__(self, schema_name: str, context: TypeHintContext): + def __init__(self, schema_name: str, required: bool, context: TypeHintContext): self.schema_name = schema_name self.context = context + self.required = required super().__init__() def apply_hint(self, value): @@ -163,6 +164,8 @@ def apply_hint(self, value): return value def _apply_ref_hint(self, value): + if not value and self.required: + raise ValidationProblem(f"Missing required {self.schema_name} reference") if self.is_array: for item in value: if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): @@ -364,7 +367,8 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) @classmethod - def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHintContext] = None): + def find_type_hint_for_subschema(cls, subschema: Any, required: bool = False, + context: Optional[TypeHintContext] = None): if subschema is not None: t = subschema.get('type') if t == 'string': @@ -374,14 +378,14 @@ def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint return EnumHint(mapping) link_to = subschema.get('linkTo') if link_to and context.schema_exists(link_to): - return RefHint(schema_name=link_to, context=context) + return RefHint(schema_name=link_to, required=required, context=context) return StringHint() elif t in ('integer', 'number'): return NumHint(declared_type=t) elif t == 'boolean': return BoolHint() elif t == 'array': - array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), context) + array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), required=required, context=context) if type(array_type_hint) == RefHint: array_type_hint.is_array = True return array_type_hint @@ -399,7 +403,8 @@ def finder(subheader, subschema): if subschema.get('type') == 'object': subsubschema = subschema.get('properties', {}).get(key1) if not other_headers: - hint = cls.find_type_hint_for_subschema(subsubschema, context=context) + required = key1 and subschema and key1 in subschema.get('required', []) + hint = cls.find_type_hint_for_subschema(subsubschema, required=required, context=context) if hint: return hint else: @@ -655,6 +660,9 @@ def check_flattened_row(self, row: Dict, *, tab_name: str, row_number: int, prot patch_item = copy.deepcopy(prototype) for column_number, column_value in enumerate(row.values()): parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics) + if len(row) > column_number and (list(row.keys())[column_number] or "").endswith("#"): + if isinstance(parsed_value, str): + parsed_value = [value.strip() for value in parsed_value.split(ARRAY_VALUE_DELIMITER) if value] type_hint = type_hints[column_number] if type_hint: try: From 8e2141d5f3c05242aabfa2c101562cc55c629134 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 16 Nov 2023 17:16:18 -0500 Subject: [PATCH 22/49] Minor sheet_utils updates related to SMaHT ingestion; array split related. --- dcicutils/bundle_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 577c43dc2..d2f3b456e 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -157,9 +157,8 @@ def __init__(self, schema_name: str, required: bool, context: TypeHintContext): super().__init__() def apply_hint(self, value): - if self.is_array: + if self.is_array and isinstance(value, str): value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else [] - return value self._apply_ref_hint(value) return value From df5890497a4f5e334111d659eb6e43e4f23bed0d Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 17 Nov 2023 12:53:17 -0500 Subject: [PATCH 23/49] flake8 fixes --- dcicutils/bundle_utils.py | 5 +++-- dcicutils/qa_utils.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index d2f3b456e..9f66fb4e0 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -384,8 +384,9 @@ def find_type_hint_for_subschema(cls, subschema: Any, required: bool = False, elif t == 'boolean': return BoolHint() elif t == 'array': - array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), required=required, context=context) - if type(array_type_hint) == RefHint: + array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), + required=required, context=context) + if type(array_type_hint) is RefHint: array_type_hint.is_array = True return array_type_hint return ArrayHint() diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py index 0c9b019ed..ac18f3b19 100644 --- a/dcicutils/qa_utils.py +++ b/dcicutils/qa_utils.py @@ -3540,7 +3540,7 @@ def recurse(json1, json2, path=""): if not result: # out(f"Recursive failure at {path!r} in list comparison") pass - elif type(json1) == type(json2): + elif type(json1) is type(json2): result = json1 == json2 if not result: out(f"Failed at {path!r} due to value mismatch: {json.dumps(json1)} != {json.dumps(json2)}") From 74f504b11c5cd8d0dfa297acee77ef00e80cf109 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 17 Nov 2023 18:21:26 -0500 Subject: [PATCH 24/49] Minor sheet_utils updates related to SMaHT ingestion; handle missing refs. --- dcicutils/bundle_utils.py | 46 +++++++++++++++++++++------------------ test/test_bundle_utils.py | 8 +++++-- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 9f66fb4e0..de42c4280 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -48,7 +48,7 @@ class TypeHint: def __init__(self): self.is_array = False - def apply_hint(self, value): + def apply_hint(self, value, src): return value def __str__(self): @@ -67,14 +67,14 @@ def __init__(self): TRUE_VALUES = ['true', 't'] FALSE_VALUES = ['false', 'f'] - def apply_hint(self, value): + def apply_hint(self, value, src): if isinstance(value, str) and value: l_value = value.lower() if l_value in self.TRUE_VALUES: return True elif l_value in self.FALSE_VALUES: return False - return super().apply_hint(value) + return super().apply_hint(value, src) class NumHint(TypeHint): @@ -87,13 +87,13 @@ def __init__(self, declared_type: Optional[str] = None): self.preferred_type = self.PREFERENCE_MAP.get(declared_type) super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): if isinstance(value, str) and value: if self.preferred_type: return prefer_number(value, kind=self.preferred_type) else: return value - return super().apply_hint(value) + return super().apply_hint(value, src) class EnumHint(TypeHint): @@ -105,7 +105,7 @@ def __init__(self, value_map): self.value_map = value_map super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): if isinstance(value, str): if value in self.value_map: result = self.value_map[value] @@ -120,28 +120,28 @@ def apply_hint(self, value): [only_found] = found result = self.value_map[only_found] return result - return super().apply_hint(value) + return super().apply_hint(value, src) class ArrayHint(TypeHint): def __init__(self): super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): if value is None or value == '': return [] if isinstance(value, str): if not value: return [] return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] - return super().apply_hint(value) + return super().apply_hint(value, src) class StringHint(TypeHint): def __init__(self): super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): return str(value).strip() if value is not None else "" @@ -156,21 +156,23 @@ def __init__(self, schema_name: str, required: bool, context: TypeHintContext): self.required = required super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): if self.is_array and isinstance(value, str): value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else [] - self._apply_ref_hint(value) + if self.is_array and isinstance(value, list): + for item in value: + self._apply_ref_hint(item, src) + else: + self._apply_ref_hint(value, src) return value - def _apply_ref_hint(self, value): + def _apply_ref_hint(self, value, src): if not value and self.required: - raise ValidationProblem(f"Missing required {self.schema_name} reference") - if self.is_array: - for item in value: - if item and not self.context.validate_ref(item_type=self.schema_name, item_ref=item): - raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {item!r}") + raise ValidationProblem(f"No required reference (linkTo) value for: {self.schema_name}" + f"{f' from {src}' if src else ''}") if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value): - raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}") + raise ValidationProblem(f"Cannot resolve reference (linkTo) for: {self.schema_name}" + f"{f'/{value}' if value else ''}{f' from {src}' if src else ''}") return value @@ -660,13 +662,15 @@ def check_flattened_row(self, row: Dict, *, tab_name: str, row_number: int, prot patch_item = copy.deepcopy(prototype) for column_number, column_value in enumerate(row.values()): parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics) - if len(row) > column_number and (list(row.keys())[column_number] or "").endswith("#"): + column_name = (list(row.keys())[column_number] or "") if len(row) > column_number else "" + if column_name.endswith("#"): if isinstance(parsed_value, str): parsed_value = [value.strip() for value in parsed_value.split(ARRAY_VALUE_DELIMITER) if value] type_hint = type_hints[column_number] if type_hint: try: - parsed_value = type_hint.apply_hint(parsed_value) + src = f"{tab_name}{f'.{column_name}' if column_name else ''}" + parsed_value = type_hint.apply_hint(parsed_value, src) except ValidationProblem as e: headers = self.headers_by_tab_name[tab_name] column_name = headers[column_number] diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 3edd4acc0..64996e0ae 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -723,8 +723,12 @@ def test_table_checker(): portal_env=mock_ff_env) checker.check_tabs() expected_problems = [ - f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", - f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}" + # f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", + # f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}" + # "User[0].project: Cannot resolve reference (linkTo) for: Project/dac6d5b3-6ef6-4271-9715-a78329acf846 from User.project", + # "User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/87199845-51b5-4352-bdea-583edae4bb6a from User.user_institution" + f"User[0].project: Cannot resolve reference (linkTo) for: Project/{SAMPLE_PROJECT_UUID} from User.project", + f"User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution" ] expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems] assert exc.value.problems == expected_problems From b5ecf1ab86c035c4058d5f0b182aa6a62180ffac Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 18 Nov 2023 09:29:51 -0500 Subject: [PATCH 25/49] flake8 fixes --- test/test_bundle_utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 64996e0ae..008442430 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -723,12 +723,10 @@ def test_table_checker(): portal_env=mock_ff_env) checker.check_tabs() expected_problems = [ - # f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", - # f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}" - # "User[0].project: Cannot resolve reference (linkTo) for: Project/dac6d5b3-6ef6-4271-9715-a78329acf846 from User.project", - # "User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/87199845-51b5-4352-bdea-583edae4bb6a from User.user_institution" - f"User[0].project: Cannot resolve reference (linkTo) for: Project/{SAMPLE_PROJECT_UUID} from User.project", - f"User[0].user_institution: Cannot resolve reference (linkTo) for: Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution" + f"User[0].project: Cannot resolve reference (linkTo) for:" + f" Project/{SAMPLE_PROJECT_UUID} from User.project", + f"User[0].user_institution: Cannot resolve reference (linkTo) for:" + f" Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution" ] expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems] assert exc.value.problems == expected_problems From 72057e11001bcc6aaf6ff8741691b292b853b324 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 18 Nov 2023 22:59:55 -0500 Subject: [PATCH 26/49] Added split_string to dcicutils. --- dcicutils/misc_utils.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 97a17c21b..99aa0b4c6 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -23,7 +23,7 @@ from collections import defaultdict from datetime import datetime as datetime_type from dateutil.parser import parse as dateutil_parse -from typing import Optional +from typing import List, Optional # Is this the right place for this? I feel like this should be done in an application, not a library. @@ -1404,6 +1404,30 @@ def string_list(s): return [p for p in [part.strip() for part in s.split(",")] if p] +def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> List[str]: + """ + Splits the given string into an array of string based on the given delimiter, and an optional escape character. + """ + if not isinstance(value, str) or not (value := value.strip()): + return [] + if not isinstance(escape, str) or not escape: + return [item.strip() for item in value.split(delimiter)] + result = [] + item = r"" + escaped = False + for c in value: + if c == delimiter and not escaped: + result.append(item.strip()) + item = r"" + elif c == escape and not escaped: + escaped = True + else: + item += c + escaped = False + result.append(item.strip()) + return [item for item in result if item] + + def is_c4_arn(arn: str) -> bool: """ Returns True iff the given (presumed) AWS ARN string value looks like it From 4b6f527d363ab82bf4cec22107588db6ddda2a08 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 18 Nov 2023 23:07:43 -0500 Subject: [PATCH 27/49] Added split_string to dcicutils. --- test/test_misc_utils.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index b32a6d4ce..f2c9ebeed 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -31,7 +31,7 @@ ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents, - pad_to, JsonLinesReader, + pad_to, JsonLinesReader, split_string ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, @@ -3590,3 +3590,22 @@ def test_json_lines_reader_lists(): parsed = [line for line in JsonLinesReader(fp)] expected = [item1, item2] assert parsed == expected + + +def test_split_array_string(): + def split_array_string(value: str) -> List[str]: + return split_string(value, "|", "\\") + assert split_array_string(r"abc|def|ghi") == ["abc", "def", "ghi"] + assert split_array_string(r"abc\|def|ghi") == ["abc|def", "ghi"] + assert split_array_string(r"abc\\|def|ghi") == ["abc\\", "def", "ghi"] + assert split_array_string(r"abc\\\|def|ghi") == ["abc\\|def", "ghi"] + assert split_array_string(r"abc\\\|def\|ghi") == ["abc\\|def|ghi"] + assert split_array_string(r"abc\\|\\def\|ghi") == ["abc\\", "\\def|ghi"] + assert split_array_string(r"abc\\|\\def\|ghi||") == ["abc\\", "\\def|ghi"] + assert split_array_string(r"|abcdefghi|") == ["abcdefghi"] + assert split_array_string(r"|abcdefghi||xyzzy") == ["abcdefghi", "xyzzy"] + assert split_array_string(r"") == [] + assert split_array_string(None) == [] + assert split_array_string(r"|") == [] + assert split_array_string(r"\|") == ["|"] + assert split_array_string(r"\\|") == ["\\"] From 17d1d4d6014ba0503131857f582653bfad4082ef Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 18 Nov 2023 23:14:18 -0500 Subject: [PATCH 28/49] Added merge_objects to misc_utils. --- dcicutils/misc_utils.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 99aa0b4c6..4582f9c0a 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -23,7 +23,7 @@ from collections import defaultdict from datetime import datetime as datetime_type from dateutil.parser import parse as dateutil_parse -from typing import List, Optional +from typing import Any, List, Optional, Union # Is this the right place for this? I feel like this should be done in an application, not a library. @@ -2074,6 +2074,21 @@ def merge_key_value_dict_lists(x, y): return [key_value_dict(k, v) for k, v in merged.items()] +def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]]) -> dict: + if isinstance(target, dict) and isinstance(source, dict) and source: + for key, value in source.items(): + target[key] = merge_objects(target[key], value) if key in target else value + elif isinstance(target, list) and isinstance(source, list) and source: + for i in range(max(len(source), len(target))): + if i < len(target): + target[i] = merge_objects(target[i], source[i] if i < len(source) else source[len(source) - 1]) + else: + target.append(source[i]) + elif source: + target = source + return target + + # Stealing topological sort classes below from python's graphlib module introduced # in v3.9 with minor refactoring. # Source: https://github.com/python/cpython/blob/3.11/Lib/graphlib.py From 5aaaaed69f98c899ba501aa8c82538c7a907f6b9 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 18 Nov 2023 23:23:37 -0500 Subject: [PATCH 29/49] Added remove_empty_properties to misc_utils. --- dcicutils/bundle_utils.py | 14 +------------- dcicutils/misc_utils.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index de42c4280..0f9b73e1e 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -4,7 +4,7 @@ from .common import AnyJsonData from .env_utils import EnvUtils, public_env_name from .ff_utils import get_metadata -from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, to_camel_case +from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, remove_empty_properties, to_camel_case from .sheet_utils import ( LoadTableError, prefer_number, TabbedJsonSchemas, Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, @@ -752,15 +752,3 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional override_schemas=override_schemas) return checked_items, problems return checked_items - - -def remove_empty_properties(data: Optional[Union[list, dict]]) -> None: - if isinstance(data, dict): - for key in list(data.keys()): - if (value := data[key]) in [None, "", {}, []]: - del data[key] - else: - remove_empty_properties(value) - elif isinstance(data, list): - for item in data: - remove_empty_properties(item) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 4582f9c0a..40b4d1604 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1107,6 +1107,18 @@ def remove_suffix(suffix: str, text: str, required: bool = False): return text[:len(text)-len(suffix)] +def remove_empty_properties(data: Optional[Union[list, dict]]) -> None: + if isinstance(data, dict): + for key in list(data.keys()): + if (value := data[key]) in [None, "", {}, []]: + del data[key] + else: + remove_empty_properties(value) + elif isinstance(data, list): + for item in data: + remove_empty_properties(item) + + class ObsoleteError(Exception): pass From 7fd4037e750d712e57778aff802b27a08f8e07c7 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 18 Nov 2023 23:38:42 -0500 Subject: [PATCH 30/49] Added zip_utils. --- dcicutils/zip_utils.py | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 dcicutils/zip_utils.py diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py new file mode 100644 index 000000000..9324d1e2c --- /dev/null +++ b/dcicutils/zip_utils.py @@ -0,0 +1,78 @@ +from contextlib import contextmanager +import gzip +import os +import shutil +import tarfile +import tempfile +from typing import List, Optional, Union +import zipfile + + +@contextmanager +def unpack_zip_file_to_temporary_directory(file: str) -> str: + with temporary_directory() as tmp_directory_name: + with zipfile.ZipFile(file, "r") as zipf: + zipf.extractall(tmp_directory_name) + yield tmp_directory_name + + +@contextmanager +def unpack_tar_file_to_temporary_directory(file: str) -> str: + with temporary_directory() as tmp_directory_name: + with tarfile.open(file, "r") as tarf: + tarf.extractall(tmp_directory_name) + yield tmp_directory_name + + +def unpack_files(file: str, suffixes: Optional[List[str]] = None) -> Optional[str]: + unpack_file_to_tmp_directory = { + ".tar": unpack_tar_file_to_temporary_directory, + ".zip": unpack_zip_file_to_temporary_directory + }.get(file[dot:]) if (dot := file.rfind(".")) > 0 else None + if unpack_file_to_tmp_directory is not None: + with unpack_file_to_tmp_directory(file) as tmp_directory_name: + for directory, _, files in os.walk(tmp_directory_name): # Ignore "." prefixed files. + for file in [file for file in files if not file.startswith(".")]: + if not suffixes or any(file.endswith(suffix) for suffix in suffixes): + yield os.path.join(directory, file) + + +@contextmanager +def unpack_gz_file_to_temporary_file(file: str, suffix: Optional[str] = None) -> str: + if (gz := file.endswith(".gz")) or file.endswith(".tgz"): # The .tgz suffix is simply short for .tar.gz. + with temporary_file(name=os.path.basename(file[:-3] if gz else file[:-4] + ".tar")) as tmp_file_name: + with open(tmp_file_name, "wb") as outputf: + with gzip.open(file, "rb") as inputf: + outputf.write(inputf.read()) + outputf.close() + yield tmp_file_name + + +@contextmanager +def temporary_directory() -> str: + try: + with tempfile.TemporaryDirectory() as tmp_directory_name: + yield tmp_directory_name + finally: + remove_temporary_directory(tmp_directory_name) + + +@contextmanager +def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None, + content: Optional[Union[str, List[str]]] = None) -> str: + with temporary_directory() as tmp_directory_name: + tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "") + with open(tmp_file_name, "w") as tmp_file: + tmp_file.write("\n".join(content) if isinstance(content, list) else str(content)) + yield tmp_file_name + + +def remove_temporary_directory(tmp_directory_name: str) -> None: + def is_temporary_directory(path: str) -> bool: + try: + tmpdir = tempfile.gettempdir() + return os.path.commonpath([path, tmpdir]) == tmpdir and os.path.exists(path) and os.path.isdir(path) + except Exception: + return False + if is_temporary_directory(tmp_directory_name): # Guard against errant deletion. + shutil.rmtree(tmp_directory_name) From 9c5712c60c9b4c4a9ab8cbd680de5a394fc8159e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 19 Nov 2023 10:56:47 -0500 Subject: [PATCH 31/49] Minor fix to temporary_file in zip_utils. --- dcicutils/zip_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py index 9324d1e2c..56ad9bbbf 100644 --- a/dcicutils/zip_utils.py +++ b/dcicutils/zip_utils.py @@ -62,8 +62,9 @@ def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None, content: Optional[Union[str, List[str]]] = None) -> str: with temporary_directory() as tmp_directory_name: tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "") - with open(tmp_file_name, "w") as tmp_file: - tmp_file.write("\n".join(content) if isinstance(content, list) else str(content)) + if content is not None: + with open(tmp_file_name, "wb" if isinstance(content, bytes) else "w") as tmp_file: + tmp_file.write("\n".join(content) if isinstance(content, list) else content) yield tmp_file_name From ff2a6073aaf9969f40fe679cf5dd52349385d531 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 19 Nov 2023 11:14:57 -0500 Subject: [PATCH 32/49] Added right_trim_tuple to misc_utils. --- dcicutils/misc_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 40b4d1604..f4be0b885 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -23,7 +23,7 @@ from collections import defaultdict from datetime import datetime as datetime_type from dateutil.parser import parse as dateutil_parse -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Tuple, Union # Is this the right place for this? I feel like this should be done in an application, not a library. @@ -1440,6 +1440,13 @@ def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> Li return [item for item in result if item] +def right_trim_tuple(t: Tuple[Any]) -> Tuple[Any]: + i = len(t) - 1 + while i >= 0 and t[i] is None: + i -= 1 + return t[:i + 1] + + def is_c4_arn(arn: str) -> bool: """ Returns True iff the given (presumed) AWS ARN string value looks like it From bc27b136ac5333abd81721e5ca00ca2d70c3316e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 19 Nov 2023 12:53:10 -0500 Subject: [PATCH 33/49] Added right_trim_tuple to misc_utils. --- dcicutils/zip_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py index 56ad9bbbf..e98e6ac70 100644 --- a/dcicutils/zip_utils.py +++ b/dcicutils/zip_utils.py @@ -59,7 +59,7 @@ def temporary_directory() -> str: @contextmanager def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None, - content: Optional[Union[str, List[str]]] = None) -> str: + content: Optional[Union[str, bytes, List[str]]] = None) -> str: with temporary_directory() as tmp_directory_name: tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "") if content is not None: From 9b0df7af9e6a7a5cb67e199320be57895b489f55 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 20 Nov 2023 07:42:42 -0500 Subject: [PATCH 34/49] Added right_trim_tuple to misc_utils. --- dcicutils/misc_utils.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index f4be0b885..1893b3d1d 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1440,11 +1440,15 @@ def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> Li return [item for item in result if item] -def right_trim_tuple(t: Tuple[Any]) -> Tuple[Any]: - i = len(t) - 1 - while i >= 0 and t[i] is None: +def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]]) -> Union[List[Any], Tuple[Any]]: + """ + Removes training None (or emptry string) values from the give tuple or list arnd returns; + does NOT change the given value. + """ + i = len(list_or_tuple) - 1 + while i >= 0 and list_or_tuple[i] in (None, ""): i -= 1 - return t[:i + 1] + return list_or_tuple[:i + 1] def is_c4_arn(arn: str) -> bool: @@ -2093,14 +2097,21 @@ def merge_key_value_dict_lists(x, y): return [key_value_dict(k, v) for k, v in merged.items()] -def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]]) -> dict: +def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict: + """ + Merges the given source dictionary or list into the target dictionary or list. + This DOES change the given target (dictionary or list) IN PLACE. + """ if isinstance(target, dict) and isinstance(source, dict) and source: for key, value in source.items(): - target[key] = merge_objects(target[key], value) if key in target else value + target[key] = merge_objects(target[key], value, full) if key in target else value elif isinstance(target, list) and isinstance(source, list) and source: for i in range(max(len(source), len(target))): if i < len(target): - target[i] = merge_objects(target[i], source[i] if i < len(source) else source[len(source) - 1]) + if i < len(source): + target[i] = merge_objects(target[i], source[i], full) + elif full: + target[i] = merge_objects(target[i], source[len(source) - 1], full) else: target.append(source[i]) elif source: From 4d407981605d810ba7fa6e19168c25f5a0fb5dc4 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 20 Nov 2023 11:38:21 -0500 Subject: [PATCH 35/49] Added to_float and to_integer to misc_utils. --- dcicutils/misc_utils.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 1893b3d1d..1c8effead 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -978,6 +978,20 @@ def str_to_bool(x: Optional[str]) -> Optional[bool]: raise ValueError(f"An argument to str_or_bool must be a string or None: {x!r}") +def to_integer(value: str, fallback: Optional[Any] = None) -> Optional[Any]: + try: + return int(value) + except Exception: + return fallback + + +def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]: + try: + return float(value) + except Exception: + return fallback + + @contextlib.contextmanager def override_environ(**overrides): """ From 471f05b18eecc34e002b5e3d8fd935b534598005 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 20 Nov 2023 12:21:33 -0500 Subject: [PATCH 36/49] Added to_boolean to misc_utils --- dcicutils/misc_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 1c8effead..8076ea40a 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -992,6 +992,15 @@ def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]: return fallback +def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]: + if isinstance(value, str) and (value := value.strip().lower()): + if (lower_value := value.lower()) in ["true", "t"]: + return True + elif lower_value in ["false", "f"]: + return False + return fallback + + @contextlib.contextmanager def override_environ(**overrides): """ From a7e50d1dade36d0c69d9faf30c50e0a8ab70808e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 20 Nov 2023 12:35:54 -0500 Subject: [PATCH 37/49] Added to_enum to misc_utils --- dcicutils/misc_utils.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 8076ea40a..ba10a9ea0 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1001,6 +1001,20 @@ def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]: return fallback +def to_enum(value: str, enumerators: List[str]) -> Optional[str]: + matches = [] + if isinstance(value, str) and (value := value.strip()) and isinstance(enumerators, List): + enum_specifiers = {str(enum).lower(): enum for enum in enumerators} + if (enum_value := enum_specifiers.get(lower_value := value.lower())) is not None: + return enum_value + for enum_canonical, _ in enum_specifiers.items(): + if enum_canonical.startswith(lower_value): + matches.append(enum_canonical) + if len(matches) == 1: + return enum_specifiers[matches[0]] + return enum_specifiers[matches[0]] if len(matches) == 1 else value + + @contextlib.contextmanager def override_environ(**overrides): """ From 8c945ba9bd75654f01dcc19c72dcae4eacc8a964 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 20 Nov 2023 14:14:52 -0500 Subject: [PATCH 38/49] Updated rst doc file with zip_utils. --- docs/source/dcicutils.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 2d5b8058c..3f10ef95f 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -335,3 +335,10 @@ variant_utils .. automodule:: dcicutils.variant_utils :members: + + +zip_utils +^^^^^^^^^ + +.. automodule:: dcicutils.zip_utils + :members: From c303955f19e64b052f761b0fe9827465e21b419e Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 22 Nov 2023 10:57:24 -0500 Subject: [PATCH 39/49] Added data_readers. --- dcicutils/data_readers.py | 138 ++++++++++++++++++++++++++++++++++++++ dcicutils/misc_utils.py | 9 +++ 2 files changed, 147 insertions(+) create mode 100644 dcicutils/data_readers.py diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py new file mode 100644 index 000000000..5b525d253 --- /dev/null +++ b/dcicutils/data_readers.py @@ -0,0 +1,138 @@ +import abc +import csv +import openpyxl +from typing import Any, Generator, Iterator, List, Optional, Tuple, Union +from dcicutils.misc_utils import right_trim + + +class RowReader(abc.ABC): # These readers may evenutally go into dcicutils. + + def __init__(self): + self.header = None + self.location = 0 + self._warning_empty_headers = False + self._warning_extra_values = [] # Line numbers. + self.open() + + def __iter__(self) -> Iterator: + for row in self.rows: + self.location += 1 + if self.is_terminating_row(row): + break + if len(self.header) < len(row): # Row values beyond what there are headers for are ignored. + self._warning_extra_values.append(self.location) + yield {column: self.cell_value(value) for column, value in zip(self.header, row)} + + def _define_header(self, header: List[Optional[Any]]) -> None: + self.header = [] + for index, column in enumerate(header or []): + if not (column := str(column).strip() if column is not None else ""): + self._warning_empty_headers = True + break # Empty header column signals end of header. + self.header.append(column) + + def rows(self) -> Generator[Union[List[Optional[Any]], Tuple[Optional[Any], ...]], None, None]: + yield + + def is_terminating_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool: + return False + + def cell_value(self, value: Optional[Any]) -> Optional[Any]: + return str(value).strip() if value is not None else "" + + def open(self) -> None: + pass + + @property + def issues(self) -> Optional[List[str]]: + issues = [] + if self._warning_empty_headers: + issues.append("Empty header column encountered; ignoring it and all subsequent columns.") + if self._warning_extra_values: + issues.extend([f"Extra column values on row [{row_number}]" for row_number in self._warning_extra_values]) + return issues if issues else None + + +class ListReader(RowReader): + + def __init__(self, rows: List[List[Optional[Any]]]) -> None: + self._rows = rows + super().__init__() + + @property + def rows(self) -> Generator[List[Optional[Any]], None, None]: + for row in self._rows[1:]: + yield row + + def open(self) -> None: + if not self.header: + self._define_header(self._rows[0] if self._rows else []) + + +class CsvReader(RowReader): + + def __init__(self, file: str) -> None: + self._file = file + self._file_handle = None + self._rows = None + super().__init__() + + @property + def rows(self) -> Generator[List[Optional[Any]], None, None]: + for row in self._rows: + yield right_trim(row) + + def open(self) -> None: + if self._file_handle is None: + self._file_handle = open(self._file) + self._rows = csv.reader(self._file_handle, delimiter="\t" if self._file.endswith(".tsv") else ",") + self._define_header(right_trim(next(self._rows, []))) + + def __del__(self) -> None: + if (file_handle := self._file_handle) is not None: + self._file_handle = None + file_handle.close() + + +class ExcelSheetReader(RowReader): + + def __init__(self, sheet_name: str, workbook: openpyxl.workbook.workbook.Workbook) -> None: + self.sheet_name = sheet_name or "Sheet1" + self._workbook = workbook + self._rows = None + super().__init__() + + @property + def rows(self) -> Generator[Tuple[Optional[Any], ...], None, None]: + for row in self._rows(min_row=2, values_only=True): + yield right_trim(row) + + def is_terminating_row(self, row: Tuple[Optional[Any]]) -> bool: + return all(cell is None for cell in row) # Empty row signals end of data. + + def open(self) -> None: + if not self._rows: + self._rows = self._workbook[self.sheet_name].iter_rows + self._define_header(right_trim(next(self._rows(min_row=1, max_row=1, values_only=True), []))) + + +class Excel: + + def __init__(self, file: str) -> None: + self._file = file + self._workbook = None + self.sheet_names = None + self.open() + + def sheet_reader(self, sheet_name: str) -> ExcelSheetReader: + return ExcelSheetReader(sheet_name=sheet_name, workbook=self._workbook) + + def open(self) -> None: + if self._workbook is None: + self._workbook = openpyxl.load_workbook(self._file, data_only=True) + self.sheet_names = self._workbook.sheetnames or [] + + def __del__(self) -> None: + if (workbook := self._workbook) is not None: + self._workbook = None + workbook.close() diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index ba10a9ea0..83a3094e9 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1369,6 +1369,15 @@ def json_file_contents(filename): return json.load(fp) +def load_json_if(s: str, is_array: bool = False, is_object: bool = False) -> Optional[Any]: + if ((is_object and s.startswith("{") and s.endswith("}")) or (is_array and s.startswith("[") and s.endswith("]"))): + try: + return json.loads(s) + except Exception: + pass + return s + + def camel_case_to_snake_case(s, separator='_'): """ Converts CamelCase to snake_case. From eb9ccf36071e227e7d3e1fd6ff933efb8611a444 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 22 Nov 2023 11:17:10 -0500 Subject: [PATCH 40/49] Comments --- dcicutils/data_readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py index 5b525d253..48a0d0c01 100644 --- a/dcicutils/data_readers.py +++ b/dcicutils/data_readers.py @@ -5,7 +5,7 @@ from dcicutils.misc_utils import right_trim -class RowReader(abc.ABC): # These readers may evenutally go into dcicutils. +class RowReader(abc.ABC): def __init__(self): self.header = None From 7037476ea34d96b973a0286faa56db5d50b0b198 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Wed, 22 Nov 2023 13:45:24 -0500 Subject: [PATCH 41/49] Minor update to merge_objects. --- dcicutils/misc_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 83a3094e9..0b93f79c5 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -2146,8 +2146,10 @@ def merge_key_value_dict_lists(x, y): def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict: """ Merges the given source dictionary or list into the target dictionary or list. - This DOES change the given target (dictionary or list) IN PLACE. + This MAY change the given target (dictionary or list) IN PLACE. """ + if target is None: + return source if isinstance(target, dict) and isinstance(source, dict) and source: for key, value in source.items(): target[key] = merge_objects(target[key], value, full) if key in target else value From 19612b15fb02e8e332a90139777433e3ed032560 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 27 Nov 2023 11:42:56 -0500 Subject: [PATCH 42/49] Minor updates to Excel class in data_readers. --- dcicutils/data_readers.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py index 48a0d0c01..46a07a059 100644 --- a/dcicutils/data_readers.py +++ b/dcicutils/data_readers.py @@ -1,7 +1,7 @@ import abc import csv import openpyxl -from typing import Any, Generator, Iterator, List, Optional, Tuple, Union +from typing import Any, Generator, Iterator, List, Optional, Type, Tuple, Union from dcicutils.misc_utils import right_trim @@ -17,6 +17,8 @@ def __init__(self): def __iter__(self) -> Iterator: for row in self.rows: self.location += 1 + if self.is_comment_row(row): + continue if self.is_terminating_row(row): break if len(self.header) < len(row): # Row values beyond what there are headers for are ignored. @@ -34,6 +36,9 @@ def _define_header(self, header: List[Optional[Any]]) -> None: def rows(self) -> Generator[Union[List[Optional[Any]], Tuple[Optional[Any], ...]], None, None]: yield + def is_comment_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool: + return False + def is_terminating_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool: return False @@ -118,14 +123,18 @@ def open(self) -> None: class Excel: - def __init__(self, file: str) -> None: + def __init__(self, file: str, reader_class: Optional[Type] = None) -> None: self._file = file self._workbook = None self.sheet_names = None + if isinstance(reader_class, Type) and issubclass(reader_class, ExcelSheetReader): + self._reader_class = reader_class + else: + self._reader_class = ExcelSheetReader self.open() def sheet_reader(self, sheet_name: str) -> ExcelSheetReader: - return ExcelSheetReader(sheet_name=sheet_name, workbook=self._workbook) + return self._reader_class(sheet_name=sheet_name, workbook=self._workbook) def open(self) -> None: if self._workbook is None: From cd0f51290a8e096a4d32b1b87171a84aca49aefd Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 27 Nov 2023 11:49:05 -0500 Subject: [PATCH 43/49] Updated dcicutils.rst --- docs/source/dcicutils.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 3f10ef95f..f8b902536 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -79,6 +79,13 @@ cloudformation_utils :members: +data_readers +^^^^^^^^^^ + +.. automodule:: dcicutils.data_readers + :members: + + data_utils ^^^^^^^^^^ From 843b4e0a60cea54e9d02d1f07813f1a7972ae959 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 27 Nov 2023 13:23:27 -0500 Subject: [PATCH 44/49] Minor update to misc_utils.load_json_if --- dcicutils/misc_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 0b93f79c5..6f8100d51 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1369,13 +1369,16 @@ def json_file_contents(filename): return json.load(fp) -def load_json_if(s: str, is_array: bool = False, is_object: bool = False) -> Optional[Any]: - if ((is_object and s.startswith("{") and s.endswith("}")) or (is_array and s.startswith("[") and s.endswith("]"))): +def load_json_if(s: str, is_array: bool = False, is_object: bool = False, + fallback: Optional[Any] = None) -> Optional[Any]: + if (isinstance(s, str) and + ((is_object and s.startswith("{") and s.endswith("}")) or + (is_array and s.startswith("[") and s.endswith("]")))): try: return json.loads(s) except Exception: pass - return s + return fallback def camel_case_to_snake_case(s, separator='_'): From 5db09855c5bb8424846918fcb37e6a5579fc1612 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 27 Nov 2023 15:40:18 -0500 Subject: [PATCH 45/49] Added test for merge_objects --- test/test_misc_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index f2c9ebeed..52bed8f70 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -31,7 +31,7 @@ ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents, - pad_to, JsonLinesReader, split_string + pad_to, JsonLinesReader, split_string, merge_objects ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, @@ -3609,3 +3609,19 @@ def split_array_string(value: str) -> List[str]: assert split_array_string(r"|") == [] assert split_array_string(r"\|") == ["|"] assert split_array_string(r"\\|") == ["\\"] + + +def test_merge_objects_1(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, None]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_2(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"goo": None}]} + merge_objects(target, source, True) + assert target == expected From 59673ca8d7923fee519dca1d912a8579afb448df Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 27 Nov 2023 15:54:27 -0500 Subject: [PATCH 46/49] Comments and tests for misc_utils.merge_objects. --- dcicutils/misc_utils.py | 4 +++- test/test_misc_utils.py | 47 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 6f8100d51..e38f980df 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -2149,7 +2149,9 @@ def merge_key_value_dict_lists(x, y): def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict: """ Merges the given source dictionary or list into the target dictionary or list. - This MAY change the given target (dictionary or list) IN PLACE. + This MAY well change the given target (dictionary or list) IN PLACE. + The the full argument is True then any target lists longer than the + source be will be filled out with the last element(s) of the source. """ if target is None: return source diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 52bed8f70..7d737793a 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -3625,3 +3625,50 @@ def test_merge_objects_2(): expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"goo": None}]} merge_objects(target, source, True) assert target == expected + + +def test_merge_objects_3(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_4(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, True) + assert target == expected + + +def test_merge_objects_5(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, False) + assert target == expected + +def test_merge_objects_6(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, True) + assert target == expected + + +def test_merge_objects_7(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]} + source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, "def", 123]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_8(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]} + source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]} + merge_objects(target, source, True) + assert target == expected From b27428f4edd371e7c50e25f59a267d304f8f9685 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Mon, 27 Nov 2023 15:55:14 -0500 Subject: [PATCH 47/49] lint fixes --- test/test_misc_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 7d737793a..6bf0afebe 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -3650,6 +3650,7 @@ def test_merge_objects_5(): merge_objects(target, source, False) assert target == expected + def test_merge_objects_6(): target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]} source = {"xyzzy": [{"foo": None}, {"goo": None}]} @@ -3669,6 +3670,7 @@ def test_merge_objects_7(): def test_merge_objects_8(): target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]} source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]} - expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]} + expected = {"abc": {"def": {"ghi": None}}, + "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]} merge_objects(target, source, True) assert target == expected From c0ba3d4ef7eabf889569b4617add4da442cb21d7 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 30 Nov 2023 05:52:01 -0500 Subject: [PATCH 48/49] Minor update to misc_utils.right_trim --- dcicutils/misc_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index e38f980df..0484d6992 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -23,7 +23,7 @@ from collections import defaultdict from datetime import datetime as datetime_type from dateutil.parser import parse as dateutil_parse -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union # Is this the right place for this? I feel like this should be done in an application, not a library. @@ -1489,13 +1489,14 @@ def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> Li return [item for item in result if item] -def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]]) -> Union[List[Any], Tuple[Any]]: +def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]], + remove: Optional[Callable] = None) -> Union[List[Any], Tuple[Any]]: """ Removes training None (or emptry string) values from the give tuple or list arnd returns; does NOT change the given value. """ i = len(list_or_tuple) - 1 - while i >= 0 and list_or_tuple[i] in (None, ""): + while i >= 0 and ((remove and remove(list_or_tuple[i])) or (not remove and list_or_tuple[i] in (None, ""))): i -= 1 return list_or_tuple[:i + 1] From 35296b76e886fa87439afadef436a60ad009102a Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 30 Nov 2023 06:25:56 -0500 Subject: [PATCH 49/49] Update version and CHANGELOG.rst. --- CHANGELOG.rst | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 63c3ab9c3..317a7391b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Change Log 8.4.0 ===== -* More work related to SMaHT ingestion. +* More work related to SMaHT ingestion (bundle/sheet_utils, data_readers, etc). 8.3.0 diff --git a/pyproject.toml b/pyproject.toml index df6ceec2f..4ae62c250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.3.0.1b1" # TODO: To become 8.4.0 +version = "8.4.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT"