From d90a1013311d79b647a95d4caaf13e15544d472e Mon Sep 17 00:00:00 2001
From: Matthias Bernt <m.bernt@ufz.de>
Date: Sun, 15 May 2022 14:23:17 +0200
Subject: [PATCH] treat empty files correctly

Empty files are now treated as 0-column files instead of defaulting to a
single column of type str.

- refactor type_overrules_type for simplicity
- add tests
---
 lib/galaxy/datatypes/tabular.py          | 17 +++---
 test/unit/data/datatypes/test_tabular.py | 77 +++++++++++++++++++++++-
 2 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/lib/galaxy/datatypes/tabular.py b/lib/galaxy/datatypes/tabular.py
index 6e52675b3966..0e3ed9589519 100644
--- a/lib/galaxy/datatypes/tabular.py
+++ b/lib/galaxy/datatypes/tabular.py
@@ -450,18 +450,18 @@ def set_meta(
         column_type_compare_order = list(column_type_set_order)  # Order to compare column types
         column_type_compare_order.reverse()
 
-        def type_overrules_type(column_type1, column_type2):
-            if column_type1 is None or column_type1 == column_type2:
+        def type_overrules_type(new_column_type, old_column_type):
+            if new_column_type is None or new_column_type == old_column_type:
                 return False
-            if column_type2 is None:
+            if old_column_type is None:
                 return True
             for column_type in column_type_compare_order:
-                if column_type1 == column_type:
+                if new_column_type == column_type:
                     return True
-                if column_type2 == column_type:
+                if old_column_type == column_type:
                     return False
             # neither column type was found in our ordered list, this cannot happen
-            raise ValueError(f"Tried to compare unknown column types: {column_type1} and {column_type2}")
+            raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")
 
         def is_int(column_text):
             # Don't allow underscores in numeric literals (PEP 515)
@@ -508,7 +508,7 @@ def guess_column_type(column_text):
         comment_lines = 0
         column_names = None
         column_types: List = []
-        first_line_column_types = [default_column_type]  # default value is one column of type str
+        first_line_column_types = []
         if dataset.has_data():
             # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
             with compression_utils.get_fileobj(dataset.get_file_name()) as dataset_fh:
@@ -556,14 +556,13 @@ def guess_column_type(column_text):
                             comment_lines = None  # type: ignore [assignment]
                         break
                     i += 1
-        print(column_types)
+
         # we error on the larger number of columns
         # first we pad our column_types by using data from first line
         if len(first_line_column_types) > len(column_types):
             for column_type in first_line_column_types[len(column_types) :]:
                 column_types.append(column_type)
         # Now we fill any unknown (None) column_types with data from first line
-        print(column_types)
         for i in range(len(column_types)):
             if column_types[i] is None:
                 if len(first_line_column_types) <= i or first_line_column_types[i] is None:
diff --git a/test/unit/data/datatypes/test_tabular.py b/test/unit/data/datatypes/test_tabular.py
index 002e8509aca3..0e3fe65b1129 100644
--- a/test/unit/data/datatypes/test_tabular.py
+++ b/test/unit/data/datatypes/test_tabular.py
@@ -25,10 +25,13 @@ def test_tabular_set_meta_large_file():
 
 
 def test_tabular_set_meta_empty():
+    """
+    empty file
+    """
     with tempfile.NamedTemporaryFile(mode="w") as test_file:
         test_file.flush()
         dataset = MockDataset(id=1)
-        dataset.file_name = test_file.name
+        dataset.set_file_name(test_file.name)
         Tabular().set_meta(dataset)
         # data and comment lines are not stored if more than MAX_DATA_LINES
         assert dataset.metadata.data_lines == 0
@@ -37,3 +40,75 @@ def test_tabular_set_meta_empty():
         assert dataset.metadata.columns == 0
         assert dataset.metadata.delimiter == "\t"
         assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_set_meta_nearly_empty():
+    """
+    file containing only a single newline
+    - the empty line is counted as a comment line, so there are no columns
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        test_file.write("\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)
+        # line counts are stored here; they are only skipped if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines == 0
+        assert dataset.metadata.comment_lines == 1
+        assert dataset.metadata.column_types == []
+        assert dataset.metadata.columns == 0
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_column_types():
+    """
+    check that the column types str, int, float and list are guessed from
+    the cells of a data line and that an empty cell ends up as str
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        # 1st line has special treatment which we want to ignore in this test
+        test_file.write("\t\t\t\t\n")
+        # note that the 1st column of this line will be detected as None
+        # but this is overwritten by the default column type (str) after
+        # checking all lines
+        test_file.write("\tstr\t23\t42.00\ta,b,c\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)
+        # line counts are stored here; they are only skipped if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines == 2
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == ["str", "str", "int", "float", "list"]
+        assert dataset.metadata.columns == 5
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_column_types_override():
+    """
+    check that guessed column types can be overridden
+    by the types guessed for later lines;
+    overwriting is only possible in the following order: None -> int -> float -> list -> str
+
+    also check that more columns can be added by later lines
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        # 1st line has special treatment which we want to ignore in this test
+        test_file.write("\t\t\t\t\n")
+        # note that the first column is detected as None, which can be overwritten by int
+        test_file.write("\t23\t42.00\ta,b,c\tstr\n")
+        test_file.write("23\t42.0\t23,42.0\tstr\t42\tanother column\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)
+        # line counts are stored here; they are only skipped if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines == 3
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == ["int", "float", "list", "str", "str", "str"]
+        assert dataset.metadata.columns == 6
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")