From d90a1013311d79b647a95d4caaf13e15544d472e Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Sun, 15 May 2022 14:23:17 +0200 Subject: [PATCH] treat empty files correctly as 0 column files - refactor type_overrules_type for simplicity - add tests --- lib/galaxy/datatypes/tabular.py | 17 +++--- test/unit/data/datatypes/test_tabular.py | 77 +++++++++++++++++++++++- 2 files changed, 84 insertions(+), 10 deletions(-) diff --git a/lib/galaxy/datatypes/tabular.py b/lib/galaxy/datatypes/tabular.py index 6e52675b3966..0e3ed9589519 100644 --- a/lib/galaxy/datatypes/tabular.py +++ b/lib/galaxy/datatypes/tabular.py @@ -450,18 +450,18 @@ def set_meta( column_type_compare_order = list(column_type_set_order) # Order to compare column types column_type_compare_order.reverse() - def type_overrules_type(column_type1, column_type2): - if column_type1 is None or column_type1 == column_type2: + def type_overrules_type(new_column_type, old_column_type): + if new_column_type is None or new_column_type == old_column_type: return False - if column_type2 is None: + if old_column_type is None: return True for column_type in column_type_compare_order: - if column_type1 == column_type: + if new_column_type == column_type: return True - if column_type2 == column_type: + if old_column_type == column_type: return False # neither column type was found in our ordered list, this cannot happen - raise ValueError(f"Tried to compare unknown column types: {column_type1} and {column_type2}") + raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}") def is_int(column_text): # Don't allow underscores in numeric literals (PEP 515) @@ -508,7 +508,7 @@ def guess_column_type(column_text): comment_lines = 0 column_names = None column_types: List = [] - first_line_column_types = [default_column_type] # default value is one column of type str + first_line_column_types = [] if dataset.has_data(): # NOTE: if skip > num_check_lines, we won't detect any metadata, and 
will use default with compression_utils.get_fileobj(dataset.get_file_name()) as dataset_fh: @@ -556,14 +556,13 @@ def guess_column_type(column_text): comment_lines = None # type: ignore [assignment] break i += 1 - print(column_types) + # we error on the larger number of columns # first we pad our column_types by using data from first line if len(first_line_column_types) > len(column_types): for column_type in first_line_column_types[len(column_types) :]: column_types.append(column_type) # Now we fill any unknown (None) column_types with data from first line - print(column_types) for i in range(len(column_types)): if column_types[i] is None: if len(first_line_column_types) <= i or first_line_column_types[i] is None: diff --git a/test/unit/data/datatypes/test_tabular.py b/test/unit/data/datatypes/test_tabular.py index 002e8509aca3..0e3fe65b1129 100644 --- a/test/unit/data/datatypes/test_tabular.py +++ b/test/unit/data/datatypes/test_tabular.py @@ -25,10 +25,13 @@ def test_tabular_set_meta_large_file(): def test_tabular_set_meta_empty(): + """ + empty file + """ with tempfile.NamedTemporaryFile(mode="w") as test_file: test_file.flush() dataset = MockDataset(id=1) - dataset.file_name = test_file.name + dataset.set_file_name(test_file.name) Tabular().set_meta(dataset) # data and comment lines are not stored if more than MAX_DATA_LINES assert dataset.metadata.data_lines == 0 @@ -37,3 +40,75 @@ def test_tabular_set_meta_empty(): assert dataset.metadata.columns == 0 assert dataset.metadata.delimiter == "\t" assert not hasattr(dataset.metadata, "column_names") + + +def test_tabular_set_meta_nearly_empty(): + """ + file just containing a single new line + - empty lines are treated as comments + """ + with tempfile.NamedTemporaryFile(mode="w") as test_file: + test_file.write("\n") + test_file.flush() + dataset = MockDataset(id=1) + dataset.set_file_name(test_file.name) + Tabular().set_meta(dataset) + # data and comment lines are not stored if more than MAX_DATA_LINES + assert 
dataset.metadata.data_lines == 0 + assert dataset.metadata.comment_lines == 1 + assert dataset.metadata.column_types == [] + assert dataset.metadata.columns == 0 + assert dataset.metadata.delimiter == "\t" + assert not hasattr(dataset.metadata, "column_names") + + +def test_tabular_column_types(): + """ + check that column types are guessed from the cell contents + - empty cells are detected as None and end up as the default type (str) + """ + with tempfile.NamedTemporaryFile(mode="w") as test_file: + # 1st line has special treatment which we want to ignore in this test + test_file.write("\t\t\t\t\n") + # note that the 1st column of this line will be detected as None + # but this is overwritten by the default column type (str) after + # checking all lines + test_file.write("\tstr\t23\t42.00\ta,b,c\n") + test_file.flush() + dataset = MockDataset(id=1) + dataset.set_file_name(test_file.name) + Tabular().set_meta(dataset) + # data and comment lines are not stored if more than MAX_DATA_LINES + assert dataset.metadata.data_lines == 2 + assert dataset.metadata.comment_lines == 0 + assert dataset.metadata.column_types == ["str", "str", "int", "float", "list"] + assert dataset.metadata.columns == 5 + assert dataset.metadata.delimiter == "\t" + assert not hasattr(dataset.metadata, "column_names") + + +def test_tabular_column_types_override(): + """ + check that guessed column types can be improved + by the types guessed for later lines + overwriting is only possible in the following order None -> int -> float -> list -> str + + also check that more columns can be added by later lines + """ + with tempfile.NamedTemporaryFile(mode="w") as test_file: + # 1st line has special treatment which we want to ignore in this test + test_file.write("\t\t\t\t\n") + # note that the first column is detected as None which can be overwritten by int + test_file.write("\t23\t42.00\ta,b,c\tstr\n") + test_file.write("23\t42.0\t23,42.0\tstr\t42\tanother column\n") + test_file.flush() + dataset = MockDataset(id=1) +
dataset.set_file_name(test_file.name) + Tabular().set_meta(dataset) + # data and comment lines are not stored if more than MAX_DATA_LINES + assert dataset.metadata.data_lines == 3 + assert dataset.metadata.comment_lines == 0 + assert dataset.metadata.column_types == ["int", "float", "list", "str", "str", "str"] + assert dataset.metadata.columns == 6 + assert dataset.metadata.delimiter == "\t" + assert not hasattr(dataset.metadata, "column_names")