Begin to address David's problems in C4-1111.
netsettler committed Oct 25, 2023
1 parent 68d9459 commit 46a2c09
Showing 6 changed files with 192 additions and 52 deletions.
55 changes: 44 additions & 11 deletions dcicutils/bundle_utils.py
@@ -9,9 +9,9 @@
from .sheet_utils import (
LoadTableError, prefer_number, TabbedJsonSchemas,
Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData,
-TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, load_table_set,
+TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, TableSetManager, load_table_set,
)
-from .validation_utils import SchemaManager
+from .validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors


PatchPrototype = Dict
@@ -378,10 +378,19 @@ def load_table_structures(filename: str, *, apply_heuristics: bool = True,

class TableChecker(InflatableTabbedDataManager, TypeHintContext):

-def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None,
+def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool,
+override_schemas: Optional[TabbedJsonSchemas] = None,
portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
apply_heuristics: bool = False):

+self.flattened = flattened
+if not flattened:
+# TODO: Need to implement something that depends on this flattened attribute.
+# Also, it's possible that we can default this once we see if the new strategy is general-purpose,
+# rather than it being a required argument. But for now let's require it be passed.
+# -kmp 25-Oct-2023
+raise ValueError("Only flattened=True is supported by TableChecker for now.")

if portal_env is None and portal_vapp is None:
portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
# InflatableTabbedDataManager supplies:
@@ -394,7 +403,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJ
self.portal_env = portal_env
self.portal_vapp = portal_vapp
self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp,
-schemas=schemas)
+override_schemas=override_schemas)
self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names) # , schemas=schemas)
self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = {
tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows)
@@ -463,6 +472,7 @@ def validate_ref(self, item_type, item_ref):
if self.contains_ref(item_type=item_type, item_ref=item_ref):
return True
try:
+# TODO: This probably needs a cache
info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}")
# Basically return True if there's a value at all,
# but still check it's not an error message that didn't get raised.
@@ -499,10 +509,13 @@ def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dic
return patch_item

@classmethod
-def check(cls, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None,
+def check(cls, tabbed_sheet_data: TabbedSheetData, *,
+flattened: bool,
+override_schemas: Optional[TabbedJsonSchemas] = None,
apply_heuristics: bool = False,
portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
-checker = cls(tabbed_sheet_data, schemas=schemas, apply_heuristics=apply_heuristics,
+checker = cls(tabbed_sheet_data, flattened=flattened,
+override_schemas=override_schemas, apply_heuristics=apply_heuristics,
portal_env=portal_env, portal_vapp=portal_vapp)
checked = checker.check_tabs()
return checked
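
For orientation (not part of the commit): a minimal sketch of the new keyword-only calling convention for TableChecker.check(). The tabbed data, tab name, and portal_env below are invented for illustration.

    from dcicutils.bundle_utils import TableChecker

    # Hypothetical sample rows; real callers pass data loaded from a workbook.
    tabbed_data = {"person": [{"name": "alice", "uuid": "some-uuid"}]}
    checked = TableChecker.check(tabbed_data,
                                 flattened=True,          # now required; only True is supported so far
                                 override_schemas=None,   # renamed from schemas=
                                 apply_heuristics=True,
                                 portal_env="data")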
@@ -538,14 +551,34 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState:


def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
-schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False,
+override_schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False,
portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None,
-validate: bool = False, **kwargs):
+# TODO: validate= is presently False (i.e., disabled) by default while being debugged,
+# but for production use maybe should not be? -kmp 25-Oct-2023
+validate: bool = False,
+**kwargs):
-tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False,
+annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False,
**kwargs)
-checked_items = check(tabbed_rows, schemas=schemas, portal_env=portal_env, portal_vapp=portal_vapp,
-apply_heuristics=apply_heuristics)
+tabbed_rows = annotated_data['content']
+flattened = annotated_data['flattened']
+if flattened:
+checked_items = TableChecker.check(tabbed_rows, flattened=flattened,
+override_schemas=override_schemas,
+portal_env=portal_env, portal_vapp=portal_vapp,
+apply_heuristics=apply_heuristics)
+else:
+# No fancy checking for things like .json, etc. for now. Only check things that came from
+# spreadsheet-like data, where structural datatypes are forced into strings.
+checked_items = tabbed_rows
+
+if validate:
+problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp,
+override_schemas=override_schemas)
+error_summary = summary_of_data_validation_errors(problems)
+if error_summary:
+for item in error_summary:
+print(item)
+raise Exception("Validation problems were seen.")
# TODO: Maybe connect validation here. Although another option is to just call validation separately
# once this is successfully loaded. Needs thought. However, David's validation_utils can do
# the validation if we decide to do it, it would just need to be connected up.
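
For orientation (not part of the commit): a sketch of the resulting load_items() call pattern. The filename is invented, and validate= stays off by default while being debugged, per the TODO comments above.

    from dcicutils.bundle_utils import load_items

    # Hypothetical workbook; spreadsheet-like sources go through TableChecker,
    # while .json/.yaml sources are passed through unchecked for now.
    items = load_items("my_metadata.xlsx",
                       tab_name="Person",
                       apply_heuristics=True,
                       validate=False)  # True raises if schema validation finds problems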
22 changes: 21 additions & 1 deletion dcicutils/sheet_utils.py
@@ -682,10 +682,30 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[
"""
Given a filename and various options
"""
+annotated_content = cls.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs)
+content: TabbedSheetData = annotated_content['content']
+return content
+
+@classmethod
+def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+**kwargs) -> Dict:
+"""
+Given a filename and various options
+"""
+orig_filename = filename
with maybe_unpack(filename) as filename:
manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
**kwargs)
-return manager.load_content()
+content: TabbedSheetData = manager.load_content()
+return {
+'filename': filename,
+'content': content,
+'tab_name': tab_name,
+'escaping': escaping,
+'singleton': isinstance(manager, SingleTableMixin),
+'flattened': isinstance(manager, FlattenedTableSetManager),
+'packed': orig_filename != filename, # tar or zip file that had to be unpacked somehow
+}


load_table_set = TableSetManager.load
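
For orientation (not part of the commit): a sketch of the annotation dict a caller of the new load_annotated() might see. The filename and flag values are illustrative and depend on which manager class handles the file.

    annotated = TableSetManager.load_annotated("people.csv", tab_name="people")
    # Plausible result for a simple single-tab CSV source:
    # {'filename': 'people.csv',
    #  'content': {'people': [...]},   # the TabbedSheetData that load() returns
    #  'tab_name': 'people',
    #  'escaping': None,
    #  'singleton': True,    # the manager is a SingleTableMixin
    #  'flattened': True,    # spreadsheet-like, so structural datatypes arrive as strings
    #  'packed': False}      # no tar/zip unpacking was needed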
55 changes: 33 additions & 22 deletions dcicutils/validation_utils.py
@@ -8,33 +8,39 @@
from .ff_utils import get_schema
from .env_utils import EnvUtils, public_env_name
from .lang_utils import there_are, maybe_pluralize, disjoined_list
-from .misc_utils import AbstractVirtualApp, PRINT
+from .misc_utils import AbstractVirtualApp, PRINT, to_snake_case
from .sheet_utils import JsonSchema, TabbedJsonSchemas, SheetData, TabbedSheetData
from .task_utils import pmap


class SchemaManager:

-SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it.

@classmethod
@contextlib.contextmanager
def fresh_schema_manager_context_for_testing(cls):
-old_schema_cache = cls.SCHEMA_CACHE
-try:
-cls.SCHEMA_CACHE = {}
-yield
-finally:
-cls.SCHEMA_CACHE = old_schema_cache
+# TODO: Remove references to this once reimplementation using an instance variable for SCHEMA_CACHE is working.
+yield
+# old_schema_cache = cls.SCHEMA_CACHE
+# try:
+#     cls.SCHEMA_CACHE = {}
+#     yield
+# finally:
+#     cls.SCHEMA_CACHE = old_schema_cache

-def __init__(self, schemas: Optional[TabbedJsonSchemas] = None,
+def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None,
portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
+self.SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
if portal_env is None and portal_vapp is None:
portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.")
self.portal_env = portal_env
self.portal_vapp = portal_vapp
-self.schemas = {} if schemas is None else schemas.copy()
+self.override_schemas = (
+{}
+if override_schemas is None
+else {to_snake_case(key): value # important to both canonicalize the case and copy the dict
+for key, value in override_schemas.items()}
+)

def fetch_relevant_schemas(self, schema_names: List[str]): # , schemas: Optional[TabbedSchemas] = None):
# if schemas is None:
@@ -51,7 +57,8 @@ def schema_exists(self, schema_name: str):
return bool(self.fetch_schema(schema_name=schema_name))

def fetch_schema(self, schema_name: str):
-override_schema = self.schemas.get(schema_name)
+schema_name = to_snake_case(schema_name)
+override_schema = self.override_schemas.get(schema_name)
if override_schema is not None:
return override_schema
schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name)
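
For orientation (not part of the commit): the to_snake_case() canonicalization means an override key matches however the type name is spelled. A small sketch with an invented type name; an empty override schema deliberately suppresses checking for that type, per the comment further down in this diff.

    manager = SchemaManager(override_schemas={"FileFormat": {}})      # CamelCase key
    assert manager.fetch_schema("file_format") == {}  # both spellings canonicalize to 'file_format'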
@@ -60,10 +67,12 @@ def fetch_schema(self, schema_name: str):
self.SCHEMA_CACHE[schema_name] = schema
return schema

-@classmethod
-def clear_schema_cache(cls):
-for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first
-cls.SCHEMA_CACHE.pop(key, None)
+# Should not be needed given SCHEMA_CACHE is an instance variable.
+#
+# @classmethod
+# def clear_schema_cache(cls):
+# for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first
+# cls.SCHEMA_CACHE.pop(key, None)

def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_name: Optional[str] = None,
among: Optional[List[str]] = None):
@@ -76,7 +85,7 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam
if prop in possible_identifying_properties))
return identifying_properties

-@classmethod
+@classmethod # This operation doesn't actually use the schemas so is safe as a class method
def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData:
if not identifying_properties:
raise ValueError("No identifying properties were specified.")
@@ -89,9 +98,10 @@ def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_proper
f' in {json.dumps(data_item)}.')


-def validate_data_against_schemas(data: TabbedSheetData,
+def validate_data_against_schemas(data: TabbedSheetData, *,
portal_env: Optional[str] = None,
portal_vapp: Optional[AbstractVirtualApp] = None,
-schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]:
+override_schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]:
"""
Validates the given data against the corresponding schema(s). The given data is assumed to
be in a format as returned by sheet_utils, i.e. a dictionary of lists of objects where each
@@ -148,15 +158,16 @@
the given data, which can be useful in identifying the object in the source data if it is unidentified.
"""

-schema_manager = SchemaManager(portal_vapp=portal_vapp, schemas=schemas)
+schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas)

errors = []
schemas = schema_manager.fetch_relevant_schemas(list(data.keys()))

for data_type in data:
schema = schemas.get(data_type)
-if not schema:
-errors.append({"error": f"No schema found for: {data_type}"})
+if schema is None: # if Schema is {}, we're deliberately suppressing schema checking (not an error)
+errors.append({"error": f"No schema found for: {data_type}"})
+continue
data_errors = validate_data_items_against_schemas(data[data_type], data_type, schema)
errors.extend(data_errors)
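
For orientation (not part of the commit): a minimal sketch of the renamed keyword-only interface, mirroring how load_items() wires it up in bundle_utils.py above. The data and environment are placeholders.

    problems = validate_data_against_schemas({"person": [{"uuid": "some-uuid"}]},
                                             portal_env="data",
                                             override_schemas=None)
    error_summary = summary_of_data_validation_errors(problems)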
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "8.0.0.1-alpha.13" # to become "8.1.0"
version = "8.0.0.1-alpha.14" # to become "8.1.0"
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
license = "MIT"
47 changes: 34 additions & 13 deletions test/test_bundle_utils.py
@@ -218,10 +218,19 @@ def test_load_table_structures():
assert str(exc.value) == "Unknown file type: something.else"


-def test_load_items():
+@contextlib.contextmanager
+def no_schemas():
+
+with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema:
+mock_get_schema.return_value = {}
+yield
+
+
+def test_load_items():

+# with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema:
+# mock_get_schema.return_value = {}
+with no_schemas():

assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT
assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT
@@ -385,29 +394,35 @@ def test_load_items_with_schema():
print("Case 2")
file_base_name = os.path.splitext(os.path.basename(SAMPLE_CSV_FILE2))[0]
expected = SAMPLE_CSV_FILE2_ITEM_CONTENT
-actual = load_items(SAMPLE_CSV_FILE2, schemas={file_base_name: {}}, apply_heuristics=True)
+actual = load_items(SAMPLE_CSV_FILE2, override_schemas={file_base_name: {}}, apply_heuristics=True)
assert actual == expected

print("Case 3")
expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED
-actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person')
+actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person')
assert actual == expected


def test_sample_items_csv_vs_json():

-csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', schemas=SAMPLE_CSV_FILE2_SCHEMAS)
+csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', override_schemas=SAMPLE_CSV_FILE2_SCHEMAS)

json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", schemas=SAMPLE_CSV_FILE2_SCHEMAS)
json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", override_schemas=SAMPLE_CSV_FILE2_SCHEMAS)

assert csv_content == json_content


def test_sample_items_json_vs_yaml():

-tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE)
-tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE)
-assert tabs_data_from_json == tabs_data_from_yaml
+with SchemaManager.fresh_schema_manager_context_for_testing():
+
+# with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema:
+# mock_get_schema.return_value = {} # no schema checking
+with no_schemas():
+
+tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE)
+tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE)
+assert tabs_data_from_json == tabs_data_from_yaml


@using_fresh_ff_state_for_testing()
@@ -421,7 +436,7 @@ def test_schema_autoload_mixin_caching(portal_env):

assert schema_manager.portal_env == 'data' # it should have defaulted even if we didn't supply it

-assert SchemaManager.SCHEMA_CACHE == {}
+assert schema_manager.SCHEMA_CACHE == {}

sample_schema_name = 'foo'
sample_schema = {'mock_schema_for': 'foo'}
Expand All @@ -431,7 +446,7 @@ def test_schema_autoload_mixin_caching(portal_env):
assert schema_manager.fetch_schema(sample_schema_name) == sample_schema

schema_cache_with_sample_schema = {sample_schema_name: sample_schema}
-assert SchemaManager.SCHEMA_CACHE == schema_cache_with_sample_schema
+assert schema_manager.SCHEMA_CACHE == schema_cache_with_sample_schema


@using_fresh_ff_state_for_testing()
@@ -639,7 +654,9 @@ def test_table_checker():

with printed_output() as printed:
with pytest.raises(Exception) as exc:
-checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, portal_env=mock_ff_env)
+checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS,
+flattened=True,
+portal_env=mock_ff_env)
checker.check_tabs()
assert str(exc.value) == "There were 2 problems while compiling hints."
assert printed.lines == [
Expand All @@ -648,8 +665,12 @@ def test_table_checker():
f" {SAMPLE_INSTITUTION_UUID!r}")
]

-checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, portal_env=mock_ff_env)
+checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS,
+flattened=True,
+portal_env=mock_ff_env)
checker.check_tabs()

-checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, portal_env=mock_ff_env)
+checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS,
+flattened=True,
+portal_env=mock_ff_env)
checker.check_tabs()
