From 3b26d87aad298d84659536b3653ac804fa854a6b Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 3 May 2024 18:02:06 +0100 Subject: [PATCH] Drop metadata schemas on tables. Closes #2944 --- python/tests/test_tables.py | 33 +++++++++++++++++++++++++++------ python/tskit/metadata.py | 35 ++++++++++++++++++++++------------- python/tskit/tables.py | 16 ++++++++++++++++ 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 5e045ace88..35e53abf83 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -102,12 +102,20 @@ def make_transposed_input_data(self, num_rows): cols = self.make_input_data(num_rows) return [ { - col: data[j] - if len(data) == num_rows - else ( - bytes(data[cols[f"{col}_offset"][j] : cols[f"{col}_offset"][j + 1]]) - if "metadata" in col - else data[cols[f"{col}_offset"][j] : cols[f"{col}_offset"][j + 1]] + col: ( + data[j] + if len(data) == num_rows + else ( + bytes( + data[ + cols[f"{col}_offset"][j] : cols[f"{col}_offset"][j + 1] + ] + ) + if "metadata" in col + else data[ + cols[f"{col}_offset"][j] : cols[f"{col}_offset"][j + 1] + ] + ) ) for col, data in cols.items() if "offset" not in col @@ -911,6 +919,19 @@ def test_random_metadata(self): ) assert metadatas == unpacked_metadatas + def test_drop_metadata(self): + for num_rows in [1, 10, 100]: + input_data = self.make_input_data(num_rows) + table_no_meta = self.table_class() + table_with_meta = self.table_class() + table_with_meta.set_columns(**input_data) + del input_data["metadata"] + del input_data["metadata_offset"] + table_no_meta.set_columns(**input_data) + assert not table_no_meta.equals(table_with_meta) + table_with_meta.drop_metadata() + table_no_meta.assert_equals(table_with_meta) + def test_optional_metadata(self): if not getattr(self, "metadata_mandatory", False): for num_rows in [0, 10, 100]: diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py index 
9b17c2f2cc..3f2b79c0e0 100644 --- a/python/tskit/metadata.py +++ b/python/tskit/metadata.py @@ -263,18 +263,18 @@ def required_validator(validator, required, instance, schema): "type": "string", "pattern": r"^([cbB\?hHiIlLqQfd]|\d*[spx])$", } -struct_meta_schema["definitions"]["root"]["properties"][ - "binaryFormat" -] = struct_meta_schema["properties"]["binaryFormat"] +struct_meta_schema["definitions"]["root"]["properties"]["binaryFormat"] = ( + struct_meta_schema["properties"]["binaryFormat"] +) # arrayLengthFormat matches regex and has default struct_meta_schema["properties"]["arrayLengthFormat"] = { "type": "string", "pattern": r"^[BHILQ]$", "default": "L", } -struct_meta_schema["definitions"]["root"]["properties"][ - "arrayLengthFormat" -] = struct_meta_schema["properties"]["arrayLengthFormat"] +struct_meta_schema["definitions"]["root"]["properties"]["arrayLengthFormat"] = ( + struct_meta_schema["properties"]["arrayLengthFormat"] +) # index is numeric struct_meta_schema["properties"]["index"] = {"type": "number"} struct_meta_schema["definitions"]["root"]["properties"]["index"] = struct_meta_schema[ @@ -285,14 +285,14 @@ def required_validator(validator, required, instance, schema): "type": "string", "default": "utf-8", } -struct_meta_schema["definitions"]["root"]["properties"][ - "stringEncoding" -] = struct_meta_schema["properties"]["stringEncoding"] +struct_meta_schema["definitions"]["root"]["properties"]["stringEncoding"] = ( + struct_meta_schema["properties"]["stringEncoding"] +) # nullTerminated is a boolean struct_meta_schema["properties"]["nullTerminated"] = {"type": "boolean"} -struct_meta_schema["definitions"]["root"]["properties"][ - "nullTerminated" -] = struct_meta_schema["properties"]["nullTerminated"] +struct_meta_schema["definitions"]["root"]["properties"]["nullTerminated"] = ( + struct_meta_schema["properties"]["nullTerminated"] +) # noLengthEncodingExhaustBuffer is a boolean struct_meta_schema["properties"]["noLengthEncodingExhaustBuffer"] = 
{"type": "boolean"} struct_meta_schema["definitions"]["root"]["properties"][ @@ -722,6 +722,15 @@ def permissive_json(): """ return MetadataSchema({"codec": "json"}) + @staticmethod + def null(): + """ + The null schema which defines no properties and results in raw bytes being returned + on accessing metadata columns, with + no constraints on the properties. + """ + return MetadataSchema(None) + # Often many replicate tree sequences are processed with identical schemas, so cache them @functools.lru_cache(maxsize=128) @@ -734,7 +743,7 @@ def parse_metadata_schema(encoded_schema: str) -> MetadataSchema: :return: A subclass of AbstractMetadataSchema. """ if encoded_schema == "": - return MetadataSchema(schema=None) + return MetadataSchema.null() else: try: decoded = json.loads( diff --git a/python/tskit/tables.py b/python/tskit/tables.py index f3184884e9..3195cf436c 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -794,6 +794,22 @@ def getter(d, k): ) return out + def drop_metadata(self, keep_schema=False): + """ + Drops all metadata in this table. By default, the schema is also cleared, + except if ``keep_schema`` is True. + + :param bool keep_schema: True if the current schema should be kept intact. + """ + if not keep_schema: + self.metadata_schema = metadata.MetadataSchema.null() + data = { + col: getattr(self, col) + for col in self.column_names + if not col.startswith("metadata") + } + self.set_columns(**data) + class IndividualTable(MetadataTable): """