Skip to content

Commit

Permalink
treat empty files correctly
Browse files Browse the repository at this point in the history
as 0 column files

- refactor type_overrules_type for simplicity
- add tests
  • Loading branch information
bernt-matthias committed May 16, 2022
1 parent 7d9de54 commit 2604602
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 10 deletions.
17 changes: 8 additions & 9 deletions lib/galaxy/datatypes/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,18 +359,18 @@ def set_meta(
column_type_compare_order = list(column_type_set_order) # Order to compare column types
column_type_compare_order.reverse()

def type_overrules_type(column_type1, column_type2):
if column_type1 is None or column_type1 == column_type2:
def type_overrules_type(new_column_type, old_column_type):
if new_column_type is None or new_column_type == old_column_type:
return False
if column_type2 is None:
if old_column_type is None:
return True
for column_type in column_type_compare_order:
if column_type1 == column_type:
if new_column_type == column_type:
return True
if column_type2 == column_type:
if old_column_type == column_type:
return False
# neither column type was found in our ordered list, this cannot happen
raise ValueError(f"Tried to compare unknown column types: {column_type1} and {column_type2}")
raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")

def is_int(column_text):
# Don't allow underscores in numeric literals (PEP 515)
Expand Down Expand Up @@ -417,7 +417,7 @@ def guess_column_type(column_text):
comment_lines = 0
column_names = None
column_types = []
first_line_column_types = [default_column_type] # default value is one column of type str
first_line_column_types = []
if dataset.has_data():
# NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
with compression_utils.get_fileobj(dataset.file_name) as dataset_fh:
Expand Down Expand Up @@ -463,14 +463,13 @@ def guess_column_type(column_text):
comment_lines = None # Clear optional comment_lines metadata value; additional comment lines could appear below this point
break
i += 1
print(column_types)

# we error on the larger number of columns
# first we pad our column_types by using data from first line
if len(first_line_column_types) > len(column_types):
for column_type in first_line_column_types[len(column_types) :]:
column_types.append(column_type)
# Now we fill any unknown (None) column_types with data from first line
print(column_types)
for i in range(len(column_types)):
if column_types[i] is None:
if len(first_line_column_types) <= i or first_line_column_types[i] is None:
Expand Down
77 changes: 76 additions & 1 deletion test/unit/data/datatypes/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ def test_tabular_set_meta_large_file():


def test_tabular_set_meta_empty():
"""
empty file
"""
with tempfile.NamedTemporaryFile(mode="w") as test_file:
test_file.flush()
dataset = MockDataset(id=1)
Expand All @@ -36,4 +39,76 @@ def test_tabular_set_meta_empty():
assert dataset.metadata.column_types == []
assert dataset.metadata.columns == 0
assert dataset.metadata.delimiter == "\t"
assert not hasattr(dataset.metadata, "column_names")
assert not hasattr(dataset.metadata, "column_names")


def test_tabular_set_meta_nearly_empty():
    """
    A file containing only a single newline.

    Empty lines are counted as comment lines, so no data lines,
    columns, or column types should be detected.
    """
    with tempfile.NamedTemporaryFile(mode="w") as handle:
        handle.write("\n")
        handle.flush()
        ds = MockDataset(id=1)
        ds.file_name = handle.name
        Tabular().set_meta(ds)
        # the lone blank line is classified as a comment, not data
        assert ds.metadata.data_lines == 0
        assert ds.metadata.comment_lines == 1
        # no data lines -> zero columns and an empty type list
        assert ds.metadata.column_types == []
        assert ds.metadata.columns == 0
        assert ds.metadata.delimiter == "\t"
        assert not hasattr(ds.metadata, "column_names")


def test_tabular_column_types():
    """
    Check column type guessing for a file with two data lines.

    The first line gets special (header-like) treatment, so the
    detected types come from the second line; a column that is empty
    on every checked line falls back to the default type (str).
    """
    with tempfile.NamedTemporaryFile(mode="w") as test_file:
        # 1st line has special treatment which we want to ignore in this test
        test_file.write("\t\t\t\t\n")
        # note that the 1st column of this line will be detected as None
        # but this is overwritten by the default column type (str) after
        # checking all lines
        test_file.write("\tstr\t23\t42.00\ta,b,c\n")
        test_file.flush()
        dataset = MockDataset(id=1)
        dataset.file_name = test_file.name
        Tabular().set_meta(dataset)
        # both written lines count as data lines, none as comments
        assert dataset.metadata.data_lines == 2
        assert dataset.metadata.comment_lines == 0
        assert dataset.metadata.column_types == ["str", "str", "int", "float", "list"]
        assert dataset.metadata.columns == 5
        assert dataset.metadata.delimiter == "\t"
        assert not hasattr(dataset.metadata, "column_names")


def test_tabular_column_types_override():
    """
    check that guessed column types can be improved
    by the types guessed for later lines
    overwriting is only possible in the following order None -> int -> float -> list -> str
    also check that more columns can be added by later lines
    """
    with tempfile.NamedTemporaryFile(mode="w") as test_file:
        # 1st line has special treatment which we want to ignore in this test
        test_file.write("\t\t\t\t\n")
        # note that the first column is detected as None, which can be overwritten by int
        test_file.write("\t23\t42.00\ta,b,c\tstr\n")
        # the 3rd line widens column 1 int->int (kept), 2 float->float, 3 list,
        # 4 str, and adds a 6th column that earlier lines did not have
        test_file.write("23\t42.0\t23,42.0\tstr\t42\tanother column\n")
        test_file.flush()
        dataset = MockDataset(id=1)
        dataset.file_name = test_file.name
        Tabular().set_meta(dataset)
        # all three written lines are data lines; none are comments
        assert dataset.metadata.data_lines == 3
        assert dataset.metadata.comment_lines == 0
        assert dataset.metadata.column_types == ["int", "float", "list", "str", "str", "str"]
        assert dataset.metadata.columns == 6
        assert dataset.metadata.delimiter == "\t"
        assert not hasattr(dataset.metadata, "column_names")

0 comments on commit 2604602

Please sign in to comment.