From 46a2c0946f4c9ba44fbbd8237a15a32cb77ae47d Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Wed, 25 Oct 2023 05:54:37 -0400
Subject: [PATCH] Begin to address David's problems in C4-1111.

---
 dcicutils/bundle_utils.py     | 55 ++++++++++++++++++++++++------
 dcicutils/sheet_utils.py      | 22 +++++++++++-
 dcicutils/validation_utils.py | 55 ++++++++++++++++++------------
 pyproject.toml                |  2 +-
 test/test_bundle_utils.py     | 47 ++++++++++++++++++--------
 test/test_validation_utils.py | 63 ++++++++++++++++++++++++++++++++---
 6 files changed, 192 insertions(+), 52 deletions(-)

diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py
index 5e37e8660..fce23f71b 100644
--- a/dcicutils/bundle_utils.py
+++ b/dcicutils/bundle_utils.py
@@ -9,9 +9,9 @@
 from .sheet_utils import (
     LoadTableError, prefer_number, TabbedJsonSchemas,
     Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData,
-    TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, load_table_set,
+    TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, TableSetManager, load_table_set,
 )
-from .validation_utils import SchemaManager
+from .validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors
 
 PatchPrototype = Dict
 
@@ -378,10 +378,19 @@ def load_table_structures(filename: str, *, apply_heuristics: bool = True,
 
 class TableChecker(InflatableTabbedDataManager, TypeHintContext):
 
-    def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None,
+    def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
+                 override_schemas: Optional[TabbedJsonSchemas] = None,
                  portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
                  apply_heuristics: bool = False):
+        self.flattened = flattened
+        if not flattened:
+            # TODO: Need to implement something that depends on this flattened attribute.
+            #       Also, it's possible that we can default this once we see if the new strategy is general-purpose,
+            #       rather than it being a required argument. But for now let's require it be passed.
+            #       -kmp 25-Oct-2023
+            raise ValueError("Only flattened=True is supported by TableChecker for now.")
+
         if portal_env is None and portal_vapp is None:
             portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
         # InflatableTabbedDataManager supplies:
@@ -394,7 +403,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJ
         self.portal_env = portal_env
         self.portal_vapp = portal_vapp
         self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
-                                                           schemas=schemas)
+                                                           override_schemas=override_schemas)
         self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names)  # , schemas=schemas)
         self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = {
             tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows)
@@ -463,6 +472,7 @@ def validate_ref(self, item_type, item_ref):
         if self.contains_ref(item_type=item_type, item_ref=item_ref):
            return True
         try:
+            # TODO: This probably needs a cache
             info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}")
             # Basically return True if there's a value at all,
             # but still check it's not an error message that didn't get raised.
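[Note, not part of the patch: the hunks above make TableChecker's new arguments keyword-only, with flattened= required. A minimal sketch of the resulting calling convention (the tabbed data and portal_env below are illustrative placeholders, not values taken from this patch):

    from dcicutils.bundle_utils import TableChecker

    tabbed_sheet_data = {'user': [{'name': 'jdoe', 'age': '23'}]}  # spreadsheet-style rows; values still strings
    checker = TableChecker(tabbed_sheet_data,
                           flattened=True,         # required for now; flattened=False raises ValueError
                           override_schemas=None,  # optional per-tab schema overrides
                           portal_env='data')      # or pass portal_vapp= instead
    checked = checker.check_tabs()
]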
@@ -499,10 +509,13 @@ def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dic
         return patch_item
 
     @classmethod
-    def check(cls, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None,
+    def check(cls, tabbed_sheet_data: TabbedSheetData, *,
+              flattened: bool,
+              override_schemas: Optional[TabbedJsonSchemas] = None,
               apply_heuristics: bool = False,
               portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
-        checker = cls(tabbed_sheet_data, schemas=schemas, apply_heuristics=apply_heuristics,
+        checker = cls(tabbed_sheet_data, flattened=flattened,
+                      override_schemas=override_schemas, apply_heuristics=apply_heuristics,
                       portal_env=portal_env, portal_vapp=portal_vapp)
         checked = checker.check_tabs()
         return checked
@@ -538,14 +551,34 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState:
 
 def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
-               schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False,
+               override_schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False,
                portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
-               validate: bool = False, **kwargs):
-    tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False,
+               # TODO: validate= is presently False (i.e., disabled) by default while being debugged,
+               #       but for production use maybe it should not be? -kmp 25-Oct-2023
+               validate: bool = False,
+               **kwargs):
+    annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False,
                                  **kwargs)
-    checked_items = check(tabbed_rows, schemas=schemas, portal_env=portal_env, portal_vapp=portal_vapp,
-                          apply_heuristics=apply_heuristics)
+    tabbed_rows = annotated_data['content']
+    flattened = annotated_data['flattened']
+    if flattened:
+        checked_items = TableChecker.check(tabbed_rows, flattened=flattened,
+                                           override_schemas=override_schemas,
+                                           portal_env=portal_env, portal_vapp=portal_vapp,
+                                           apply_heuristics=apply_heuristics)
+    else:
+        # No fancy checking for things like .json, etc. for now. Only check things that came from
+        # spreadsheet-like data, where structural datatypes are forced into strings.
+        checked_items = tabbed_rows
+    if validate:
+        problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp,
+                                                 override_schemas=override_schemas)
+        error_summary = summary_of_data_validation_errors(problems)
+        if error_summary:
+            for item in error_summary:
+                print(item)
+            raise Exception("Validation problems were seen.")
     # TODO: Maybe connect validation here. Although another option is to just call validation separately
     #       once this is successfully loaded. Needs thought. However, David's validation_utils can do
     #       the validation if we decide to do it, it would just need to be connected up.
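[Note, not part of the patch: load_items() above now keys its behavior off the 'flattened' entry in the dict returned by TableSetManager.load_annotated(), which the sheet_utils diff below introduces. A minimal sketch of that dict's shape, assuming a hypothetical spreadsheet-like input file sample.csv:

    from dcicutils.sheet_utils import TableSetManager

    annotated = TableSetManager.load_annotated('sample.csv')
    annotated['content']    # TabbedSheetData, exactly what TableSetManager.load() returns
    annotated['flattened']  # True for spreadsheet-like sources handled by a FlattenedTableSetManager
    annotated['singleton']  # True when the manager is a SingleTableMixin (single-tab sources)
    annotated['packed']     # True only if a tar or zip file had to be unpacked first
]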
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 9242bb765..66ce523fa 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -682,10 +682,30 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[
         """
         Given a filename and various options
         """
+        annotated_content = cls.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs)
+        content: TabbedSheetData = annotated_content['content']
+        return content
+
+    @classmethod
+    def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+                       **kwargs) -> Dict:
+        """
+        Given a filename and various options, loads the content and returns it
+        in a dict together with metadata describing how it was loaded.
+        """
+        orig_filename = filename
         with maybe_unpack(filename) as filename:
             manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
                                                         **kwargs)
-            return manager.load_content()
+            content: TabbedSheetData = manager.load_content()
+            return {
+                'filename': filename,
+                'content': content,
+                'tab_name': tab_name,
+                'escaping': escaping,
+                'singleton': isinstance(manager, SingleTableMixin),
+                'flattened': isinstance(manager, FlattenedTableSetManager),
+                'packed': orig_filename != filename,  # tar or zip file that had to be unpacked somehow
+            }
 
 
 load_table_set = TableSetManager.load
diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py
index 3e8169000..c4da5a856 100644
--- a/dcicutils/validation_utils.py
+++ b/dcicutils/validation_utils.py
@@ -8,33 +8,39 @@
 from .ff_utils import get_schema
 from .env_utils import EnvUtils, public_env_name
 from .lang_utils import there_are, maybe_pluralize, disjoined_list
-from .misc_utils import AbstractVirtualApp, PRINT
+from .misc_utils import AbstractVirtualApp, PRINT, to_snake_case
 from .sheet_utils import JsonSchema, TabbedJsonSchemas, SheetData, TabbedSheetData
 from .task_utils import pmap
 
 
 class SchemaManager:
 
-    SCHEMA_CACHE = {}  # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
-
     @classmethod
     @contextlib.contextmanager
     def fresh_schema_manager_context_for_testing(cls):
-        old_schema_cache = cls.SCHEMA_CACHE
-        try:
-            cls.SCHEMA_CACHE = {}
-            yield
-        finally:
-            cls.SCHEMA_CACHE = old_schema_cache
+        # TODO: Remove references to this once reimplementation using an instance variable for SCHEMA_CACHE is working.
+        yield
+        # old_schema_cache = cls.SCHEMA_CACHE
+        # try:
+        #     cls.SCHEMA_CACHE = {}
+        #     yield
+        # finally:
+        #     cls.SCHEMA_CACHE = old_schema_cache
 
-    def __init__(self, schemas: Optional[TabbedJsonSchemas] = None,
+    def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None,
                  portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
+        self.SCHEMA_CACHE = {}  # Per-instance cache (no longer shared across instances).
         if portal_env is None and portal_vapp is None:
             portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
             PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
         self.portal_env = portal_env
         self.portal_vapp = portal_vapp
-        self.schemas = {} if schemas is None else schemas.copy()
+        self.override_schemas = (
+            {}
+            if override_schemas is None
+            else {to_snake_case(key): value  # important to both canonicalize the case and copy the dict
+                  for key, value in override_schemas.items()}
+        )
 
     def fetch_relevant_schemas(self, schema_names: List[str]):  # , schemas: Optional[TabbedSchemas] = None):
         # if schemas is None:
@@ -51,7 +57,8 @@ def schema_exists(self, schema_name: str):
         return bool(self.fetch_schema(schema_name=schema_name))
 
     def fetch_schema(self, schema_name: str):
-        override_schema = self.schemas.get(schema_name)
+        schema_name = to_snake_case(schema_name)
+        override_schema = self.override_schemas.get(schema_name)
         if override_schema is not None:
             return override_schema
         schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name)
@@ -60,10 +67,12 @@
             self.SCHEMA_CACHE[schema_name] = schema
         return schema
 
-    @classmethod
-    def clear_schema_cache(cls):
-        for key in list(cls.SCHEMA_CACHE.keys()):  # important to get the list of keys as a separate object first
-            cls.SCHEMA_CACHE.pop(key, None)
+    # Should not be needed given that SCHEMA_CACHE is an instance variable.
+    #
+    # @classmethod
+    # def clear_schema_cache(cls):
+    #     for key in list(cls.SCHEMA_CACHE.keys()):  # important to get the list of keys as a separate object first
+    #         cls.SCHEMA_CACHE.pop(key, None)
 
     def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_name: Optional[str] = None,
                                among: Optional[List[str]] = None):
@@ -76,7 +85,7 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam
                                            if prop in possible_identifying_properties))
         return identifying_properties
 
-    @classmethod
+    @classmethod  # This operation doesn't actually use the schemas, so is safe as a class method
     def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData:
         if not identifying_properties:
             raise ValueError("No identifying properties were specified.")
@@ -89,9 +98,10 @@ def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_proper
                          f' in {json.dumps(data_item)}.')
 
 
-def validate_data_against_schemas(data: TabbedSheetData,
+def validate_data_against_schemas(data: TabbedSheetData, *,
+                                  portal_env: Optional[str] = None,
                                   portal_vapp: Optional[AbstractVirtualApp] = None,
-                                  schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]:
+                                  override_schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]:
     """
     Validates the given data against the corresponding schema(s). The given data is assumed to be
     in a format as returned by sheet_utils, i.e. a dictionary of lists of objects where each
@@ -148,7 +158,7 @@
     the given data, which can be useful in identifying the object in the source data if it is unidentified.
""" - schema_manager = SchemaManager(portal_vapp=portal_vapp, schemas=schemas) + schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) errors = [] schemas = schema_manager.fetch_relevant_schemas(list(data.keys())) @@ -156,7 +166,8 @@ def validate_data_against_schemas(data: TabbedSheetData, for data_type in data: schema = schemas.get(data_type) if not schema: - errors.append({"error": f"No schema found for: {data_type}"}) + if schema is None: # if Schema is {}, we're deliberately suppressing schema checking (not an error) + errors.append({"error": f"No schema found for: {data_type}"}) continue data_errors = validate_data_items_against_schemas(data[data_type], data_type, schema) errors.extend(data_errors) diff --git a/pyproject.toml b/pyproject.toml index 7e1520b69..0d661cd71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.13" # to become "8.1.0" +version = "8.0.0.1-alpha.14" # to become "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index ce716878f..8b5777404 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -218,10 +218,19 @@ def test_load_table_structures(): assert str(exc.value) == "Unknown file type: something.else" -def test_load_items(): +@contextlib.contextmanager +def no_schemas(): with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: mock_get_schema.return_value = {} + yield + + +def test_load_items(): + + # with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: + # mock_get_schema.return_value = {} + with no_schemas(): assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT @@ -385,29 +394,35 @@ def test_load_items_with_schema(): print("Case 2") file_base_name = os.path.splitext(os.path.basename(SAMPLE_CSV_FILE2))[0] expected = SAMPLE_CSV_FILE2_ITEM_CONTENT - actual = load_items(SAMPLE_CSV_FILE2, schemas={file_base_name: {}}, apply_heuristics=True) + actual = load_items(SAMPLE_CSV_FILE2, override_schemas={file_base_name: {}}, apply_heuristics=True) assert actual == expected print("Case 3") expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED - actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') assert actual == expected def test_sample_items_csv_vs_json(): - csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', schemas=SAMPLE_CSV_FILE2_SCHEMAS) + csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', override_schemas=SAMPLE_CSV_FILE2_SCHEMAS) - json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", schemas=SAMPLE_CSV_FILE2_SCHEMAS) + json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", override_schemas=SAMPLE_CSV_FILE2_SCHEMAS) assert csv_content == json_content def test_sample_items_json_vs_yaml(): - tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) - tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) - assert tabs_data_from_json == tabs_data_from_yaml + with SchemaManager.fresh_schema_manager_context_for_testing(): + + # with mock.patch.object(validation_utils_module, "get_schema") as 
mock_get_schema: + # mock_get_schema.return_value = {} # no schema checking + with no_schemas(): + + tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) + tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) + assert tabs_data_from_json == tabs_data_from_yaml @using_fresh_ff_state_for_testing() @@ -421,7 +436,7 @@ def test_schema_autoload_mixin_caching(portal_env): assert schema_manager.portal_env == 'data' # it should have defaulted even if we didn't supply it - assert SchemaManager.SCHEMA_CACHE == {} + assert schema_manager.SCHEMA_CACHE == {} sample_schema_name = 'foo' sample_schema = {'mock_schema_for': 'foo'} @@ -431,7 +446,7 @@ def test_schema_autoload_mixin_caching(portal_env): assert schema_manager.fetch_schema(sample_schema_name) == sample_schema schema_cache_with_sample_schema = {sample_schema_name: sample_schema} - assert SchemaManager.SCHEMA_CACHE == schema_cache_with_sample_schema + assert schema_manager.SCHEMA_CACHE == schema_cache_with_sample_schema @using_fresh_ff_state_for_testing() @@ -639,7 +654,9 @@ def test_table_checker(): with printed_output() as printed: with pytest.raises(Exception) as exc: - checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, + flattened=True, + portal_env=mock_ff_env) checker.check_tabs() assert str(exc.value) == "There were 2 problems while compiling hints." assert printed.lines == [ @@ -648,8 +665,12 @@ def test_table_checker(): f" {SAMPLE_INSTITUTION_UUID!r}") ] - checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, + flattened=True, + portal_env=mock_ff_env) checker.check_tabs() - checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, + flattened=True, + portal_env=mock_ff_env) checker.check_tabs() diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py index ce1730d1e..d0039a957 100644 --- a/test/test_validation_utils.py +++ b/test/test_validation_utils.py @@ -4,13 +4,68 @@ import re from dcicutils.bundle_utils import inflate -from dcicutils.misc_utils import AbstractVirtualApp, NamedObject, json_file_contents, to_snake_case -from dcicutils.qa_utils import MockResponse +from dcicutils.misc_utils import AbstractVirtualApp, NamedObject, json_file_contents, to_snake_case, to_camel_case +from dcicutils.qa_utils import MockResponse, printed_output from dcicutils.validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors from .conftest_settings import TEST_DIR from .helpers_for_bundles import SAMPLE_WORKBOOK_WITH_NAME_REFS +def test_schema_manager_simple(): + + print() # start on a fresh line + + with printed_output() as printed: + + schema_manager_1 = SchemaManager() + + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." 
+        ]
+
+        assert schema_manager_1.SCHEMA_CACHE == {}
+
+        # Test that schema-lookup works, since that's kinda what these are about
+        user_schema = schema_manager_1.fetch_schema('user')
+        assert isinstance(user_schema, dict)
+        assert user_schema.get('title') == 'User'
+
+        assert schema_manager_1.override_schemas == {}
+
+
+@pytest.mark.parametrize('schema_id', ['user', 'User'])
+def test_schema_manager_with_schemas(schema_id):
+
+    print()  # start on a fresh line
+
+    with printed_output() as printed:
+
+        schemas = {schema_id: {}}
+        snake_id = to_snake_case(schema_id)
+        camel_id = to_camel_case(schema_id)
+
+        # Just to make sure to_snake_case and to_camel_case aren't doing something weird
+        assert schema_id == snake_id or schema_id == camel_id
+
+        schema_manager_2 = SchemaManager(override_schemas=schemas)
+
+        # whether 'User' or 'user' was an input, it will be canonicalized to snake case
+        assert schema_manager_2.override_schemas == {snake_id: {}}
+
+        assert printed.lines == [
+            "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'."
+        ]
+
+        assert schema_manager_2.fetch_schema(snake_id) == {}
+        assert schema_manager_2.fetch_schema(camel_id) == {}
+
+        # even after using a camel case id, only the snake_id will be in the table
+        assert schema_manager_2.override_schemas == {snake_id: {}}
+
+        # this would only get updated if we fetched something remotely
+        assert schema_manager_2.SCHEMA_CACHE == {}
+
+
 def test_schema_manager_identifying_value():
 
     with pytest.raises(ValueError) as exc:
@@ -54,14 +109,14 @@ def get(cls, path_url):
 
     good_workbook = inflate(SAMPLE_WORKBOOK_WITH_NAME_REFS)
 
-    assert validate_data_against_schemas(good_workbook, portal_vapp) is None
+    assert validate_data_against_schemas(good_workbook, portal_vapp=portal_vapp) is None
 
     bogus_workbook = copy.deepcopy(good_workbook)  # modified immediately below
     user_items = bogus_workbook['User']
     user_item0 = user_items[0]
     user_item0['bogus'] = 'item'
 
-    assert validate_data_against_schemas(bogus_workbook, portal_vapp) == {
+    assert validate_data_against_schemas(bogus_workbook, portal_vapp=portal_vapp) == {
         'errors': [
             {
                 'extraneous_properties': ['bogus'],
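[Note, not part of the patch: as the new tests above exercise, SchemaManager now canonicalizes override_schemas keys to snake case on entry and keeps SCHEMA_CACHE as a per-instance dict, so overrides never populate the cache. A minimal sketch of that behavior (no portal access should occur for overridden names):

    from dcicutils.validation_utils import SchemaManager

    manager = SchemaManager(override_schemas={'User': {}})
    assert manager.override_schemas == {'user': {}}  # key canonicalized via to_snake_case
    assert manager.fetch_schema('User') == {}        # lookup canonicalizes too; override hit, no remote fetch
    assert manager.SCHEMA_CACHE == {}                # per-instance cache, untouched by overrides
]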