Make columns types an empty list for empty tabular data #13918

Merged · 5 commits · Mar 1, 2024
14 changes: 7 additions & 7 deletions lib/galaxy/datatypes/tabular.py
@@ -450,18 +450,18 @@ def set_meta(
         column_type_compare_order = list(column_type_set_order)  # Order to compare column types
         column_type_compare_order.reverse()
 
-        def type_overrules_type(column_type1, column_type2):
-            if column_type1 is None or column_type1 == column_type2:
+        def type_overrules_type(new_column_type, old_column_type):
+            if new_column_type is None or new_column_type == old_column_type:
                 return False
-            if column_type2 is None:
+            if old_column_type is None:
                 return True
             for column_type in column_type_compare_order:
-                if column_type1 == column_type:
+                if new_column_type == column_type:
                     return True
-                if column_type2 == column_type:
+                if old_column_type == column_type:
                     return False
             # neither column type was found in our ordered list, this cannot happen
-            raise ValueError(f"Tried to compare unknown column types: {column_type1} and {column_type2}")
+            raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")
 
         def is_int(column_text):
             # Don't allow underscores in numeric literals (PEP 515)
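
The renamed helper only makes sense together with column_type_compare_order, which is defined just above the hunk. Below is a minimal standalone sketch of how the precedence works; the value of column_type_set_order is an assumption inferred from the test docstring further down ("None -> int -> float -> list -> str"), not copied from set_meta.

# Assumed order: most specific type first; the compare order is its reverse,
# so the most general type wins a comparison.
column_type_set_order = ["int", "float", "list", "str"]
column_type_compare_order = list(reversed(column_type_set_order))


def type_overrules_type(new_column_type, old_column_type):
    if new_column_type is None or new_column_type == old_column_type:
        return False
    if old_column_type is None:
        return True
    for column_type in column_type_compare_order:
        if new_column_type == column_type:
            return True
        if old_column_type == column_type:
            return False
    raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")


assert type_overrules_type("str", "int")        # a later, more general guess widens the column
assert not type_overrules_type("int", "float")  # a narrower guess never overrules a wider one
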
@@ -508,7 +508,7 @@ def guess_column_type(column_text):
         comment_lines = 0
         column_names = None
         column_types: List = []
-        first_line_column_types = [default_column_type]  # default value is one column of type str
+        first_line_column_types = []
         if dataset.has_data():
             # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
             with compression_utils.get_fileobj(dataset.get_file_name()) as dataset_fh:
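
Why this one-line seed change matters: with the old seed of [default_column_type], a file with no data lines still ended up with a single "str" column after the per-line guesses were merged. The rough, self-contained sketch below illustrates that roll-up; merge_column_types and PRECEDENCE are illustrative names, not the actual set_meta code, and the precedence order is taken from the test docstring below.

# Simplified sketch of how per-line guesses roll up into the final column_types.
PRECEDENCE = [None, "int", "float", "list", "str"]  # later entries overrule earlier ones


def merge_column_types(seed, later_lines, default="str"):
    column_types = list(seed)
    for line_types in later_lines:
        for i, guess in enumerate(line_types):
            if i >= len(column_types):
                column_types.append(guess)  # a later line introduced an extra column
            elif PRECEDENCE.index(guess) > PRECEDENCE.index(column_types[i]):
                column_types[i] = guess     # a more general guess overrules the old one
    return [t if t is not None else default for t in column_types]


# Old seed: an empty file still reported one phantom "str" column.
assert merge_column_types(["str"], []) == ["str"]
# New seed: an empty file reports no columns at all.
assert merge_column_types([], []) == []
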
97 changes: 97 additions & 0 deletions test/unit/data/datatypes/test_tabular.py
@@ -15,3 +15,100 @@ def test_tabular_set_meta_large_file():
         dataset = MockDataset(id=1)
         dataset.set_file_name(test_file.name)
         Tabular().set_meta(dataset)  # type: ignore [arg-type]
+        # data and comment lines are not stored if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines is None
+        assert dataset.metadata.comment_lines is None
+        assert dataset.metadata.column_types == ["str", "str"]
+        assert dataset.metadata.columns == 2
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_set_meta_empty():
+    """
+    empty file
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)  # type: ignore [arg-type]
+        # an empty file has zero data and comment lines and no columns
+        assert dataset.metadata.data_lines == 0
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == []
+        assert dataset.metadata.columns == 0
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_set_meta_nearly_empty():
+    """
+    file containing just a single newline
+    - empty lines are treated as comment lines
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        test_file.write("\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)  # type: ignore [arg-type]
+        # the single empty line counts as a comment line; no columns are detected
+        assert dataset.metadata.data_lines == 0
+        assert dataset.metadata.comment_lines == 1
+        assert dataset.metadata.column_types == []
+        assert dataset.metadata.columns == 0
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_column_types():
+    """
+    file containing a line of only tab characters terminated by a newline,
+    followed by a line with one value of each guessable type
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        # the 1st line gets special treatment which we want to ignore in this test
+        test_file.write("\t\t\t\t\n")
+        # note that the 1st column of this line will be detected as None,
+        # but this is overwritten by the default column type (str) after
+        # checking all lines
+        test_file.write("\tstr\t23\t42.00\ta,b,c\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)  # type: ignore [arg-type]
+        # both lines count as data lines; column types are guessed per column
+        assert dataset.metadata.data_lines == 2
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == ["str", "str", "int", "float", "list"]
+        assert dataset.metadata.columns == 5
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_column_types_override():
+    """
+    check that guessed column types can be refined by the
+    types guessed for later lines; overwriting is only possible
+    in the order None -> int -> float -> list -> str
+
+    also check that later lines can add more columns
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        # the 1st line gets special treatment which we want to ignore in this test
+        test_file.write("\t\t\t\t\n")
+        # note that the first column is detected as None, which can be overwritten by int
+        test_file.write("\t23\t42.00\ta,b,c\tstr\n")
+        test_file.write("23\t42.0\t23,42.0\tstr\t42\tanother column\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)  # type: ignore [arg-type]
+        # more general guesses from later lines overrule earlier ones, and a 6th column is added
+        assert dataset.metadata.data_lines == 3
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == ["int", "float", "list", "str", "str", "str"]
+        assert dataset.metadata.columns == 6
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
3 changes: 2 additions & 1 deletion test/unit/data/datatypes/util.py
@@ -5,6 +5,7 @@
 from typing import Optional
 
 from galaxy.datatypes.sniff import get_test_fname
+from galaxy.util.bunch import Bunch
 from galaxy.util.hash_util import md5_hash_file
 
 
@@ -20,7 +21,7 @@ def set_file_name(self, file_name):
         self.file_name_ = file_name
 
 
-class MockMetadata:
+class MockMetadata(Bunch):
     file_name_: Optional[str] = None
 
     def get_file_name(self, sync_cache=True):
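
MockMetadata now inherits from galaxy.util.bunch.Bunch so the mock metadata behaves like a plain attribute bag that the new hasattr checks can probe. The snippet below is only a sketch of what such a Bunch-style container is assumed to provide, shown for orientation; it is not the actual galaxy.util.bunch implementation.

# Assumed behaviour: keyword arguments become attributes, and unset fields
# are simply absent, so hasattr() can distinguish set from unset metadata.
class Bunch:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


meta = Bunch(columns=2, column_types=["str", "str"])
assert meta.columns == 2
assert not hasattr(meta, "column_names")
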