
Commit

treat empty files correctly
as 0 column files

- refactor type_overrules_type for simplicity
- add tests

bernt-matthias committed Feb 19, 2024
1 parent b7ca525 commit d90a101
Showing 2 changed files with 84 additions and 10 deletions.
17 changes: 8 additions & 9 deletions lib/galaxy/datatypes/tabular.py
@@ -450,18 +450,18 @@ def set_meta(
         column_type_compare_order = list(column_type_set_order)  # Order to compare column types
         column_type_compare_order.reverse()
 
-        def type_overrules_type(column_type1, column_type2):
-            if column_type1 is None or column_type1 == column_type2:
+        def type_overrules_type(new_column_type, old_column_type):
+            if new_column_type is None or new_column_type == old_column_type:
                 return False
-            if column_type2 is None:
+            if old_column_type is None:
                 return True
             for column_type in column_type_compare_order:
-                if column_type1 == column_type:
+                if new_column_type == column_type:
                     return True
-                if column_type2 == column_type:
+                if old_column_type == column_type:
                     return False
             # neither column type was found in our ordered list, this cannot happen
-            raise ValueError(f"Tried to compare unknown column types: {column_type1} and {column_type2}")
+            raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")
 
         def is_int(column_text):
             # Don't allow underscores in numeric literals (PEP 515)
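Aside for reviewers: a minimal standalone sketch of the precedence check the renamed helper performs. COMPARE_ORDER here is an illustrative assumption standing in for Galaxy's column_type_compare_order, so treat the snippet as a sketch rather than the actual implementation.

# Standalone sketch: a new type guess only overrules the old one if it sits
# earlier in the precedence list, i.e. is the more general type.
COMPARE_ORDER = ["str", "list", "float", "int"]  # assumed example, most general first


def type_overrules_type(new_column_type, old_column_type):
    if new_column_type is None or new_column_type == old_column_type:
        return False
    if old_column_type is None:
        return True
    for column_type in COMPARE_ORDER:
        if new_column_type == column_type:
            return True
        if old_column_type == column_type:
            return False
    raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")


assert type_overrules_type("str", "int")        # str is more general and wins
assert not type_overrules_type("int", "float")  # float already covers int-looking values
assert type_overrules_type("float", None)       # any guess beats no guess at all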
@@ -508,7 +508,7 @@ def guess_column_type(column_text):
         comment_lines = 0
         column_names = None
         column_types: List = []
-        first_line_column_types = [default_column_type]  # default value is one column of type str
+        first_line_column_types = []
         if dataset.has_data():
             # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
             with compression_utils.get_fileobj(dataset.get_file_name()) as dataset_fh:
@@ -556,14 +556,13 @@ def guess_column_type(column_text):
                                 comment_lines = None  # type: ignore [assignment]
                             break
                     i += 1
-            print(column_types)
 
         # we error on the larger number of columns
         # first we pad our column_types by using data from first line
         if len(first_line_column_types) > len(column_types):
             for column_type in first_line_column_types[len(column_types) :]:
                 column_types.append(column_type)
         # Now we fill any unknown (None) column_types with data from first line
-        print(column_types)
         for i in range(len(column_types)):
             if column_types[i] is None:
                 if len(first_line_column_types) <= i or first_line_column_types[i] is None:
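Combined with the first_line_column_types change above, the padding and fill steps now leave an empty file with no columns at all. A rough standalone sketch of that flow, assuming "str" as the default column type and assuming the fill-loop branch bodies that the diff truncates:

# Simplified sketch of the post-change flow for a file from which no lines were read.
default_column_type = "str"  # assumed default, matching the test expectations below

column_types: list = []             # nothing was guessed, the file is empty
first_line_column_types: list = []  # previously [default_column_type], i.e. one str column

# pad column_types with any extra columns seen on the first line (a no-op here)
if len(first_line_column_types) > len(column_types):
    column_types.extend(first_line_column_types[len(column_types):])

# fill unknown (None) types from the first line, else fall back to the default
# (branch bodies are assumed; the diff cuts off after the condition)
for i in range(len(column_types)):
    if column_types[i] is None:
        if len(first_line_column_types) <= i or first_line_column_types[i] is None:
            column_types[i] = default_column_type
        else:
            column_types[i] = first_line_column_types[i]

print(column_types, len(column_types))  # [] 0 -> the dataset gets columns == 0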
77 changes: 76 additions & 1 deletion test/unit/data/datatypes/test_tabular.py
@@ -25,10 +25,13 @@ def test_tabular_set_meta_large_file():
 
 
 def test_tabular_set_meta_empty():
+    """
+    empty file
+    """
     with tempfile.NamedTemporaryFile(mode="w") as test_file:
         test_file.flush()
         dataset = MockDataset(id=1)
-        dataset.file_name = test_file.name
+        dataset.set_file_name(test_file.name)
         Tabular().set_meta(dataset)
         # data and comment lines are not stored if more than MAX_DATA_LINES
         assert dataset.metadata.data_lines == 0
@@ -37,3 +40,75 @@ def test_tabular_set_meta_empty():
         assert dataset.metadata.columns == 0
         assert dataset.metadata.delimiter == "\t"
         assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_set_meta_nearly_empty():
+    """
+    file containing just a single newline
+    - empty lines are treated as comments
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        test_file.write("\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)
+        # data and comment lines are not stored if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines == 0
+        assert dataset.metadata.comment_lines == 1
+        assert dataset.metadata.column_types == []
+        assert dataset.metadata.columns == 0
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_column_types():
+    """
+    check that column types are guessed from the data lines
+    - the empty 1st column falls back to the default column type (str)
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        # the 1st line gets special treatment, which we want to ignore in this test
+        test_file.write("\t\t\t\t\n")
+        # note that the 1st column of this line is detected as None,
+        # but this is overwritten by the default column type (str)
+        # after all lines have been checked
+        test_file.write("\tstr\t23\t42.00\ta,b,c\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)
+        # data and comment lines are not stored if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines == 2
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == ["str", "str", "int", "float", "list"]
+        assert dataset.metadata.columns == 5
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
+
+
+def test_tabular_column_types_override():
+    """
+    check that guessed column types can be improved by the types guessed for later lines;
+    overwriting is only possible in the order None -> int -> float -> list -> str
+    - also check that more columns can be added by later lines
+    """
+    with tempfile.NamedTemporaryFile(mode="w") as test_file:
+        # the 1st line gets special treatment, which we want to ignore in this test
+        test_file.write("\t\t\t\t\n")
+        # note that the 1st column is detected as None, which can then be overwritten by int
+        test_file.write("\t23\t42.00\ta,b,c\tstr\n")
+        test_file.write("23\t42.0\t23,42.0\tstr\t42\tanother column\n")
+        test_file.flush()
+        dataset = MockDataset(id=1)
+        dataset.set_file_name(test_file.name)
+        Tabular().set_meta(dataset)
+        # data and comment lines are not stored if more than MAX_DATA_LINES
+        assert dataset.metadata.data_lines == 3
+        assert dataset.metadata.comment_lines == 0
+        assert dataset.metadata.column_types == ["int", "float", "list", "str", "str", "str"]
+        assert dataset.metadata.columns == 6
+        assert dataset.metadata.delimiter == "\t"
+        assert not hasattr(dataset.metadata, "column_names")
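Aside on the fixtures above: a line with four tab separators splits into five fields, and the "list" guess comes from comma-separated values. A quick illustration with a toy guesser; guess_column_type below is a simplified stand-in, not Galaxy's implementation:

line = "\tstr\t23\t42.00\ta,b,c\n"
fields = line.rstrip("\r\n").split("\t")
print(len(fields), fields)  # 5 ['', 'str', '23', '42.00', 'a,b,c'] -> columns == 5


def guess_column_type(text):
    # toy stand-in for Galaxy's guesser, checking the most specific types first
    if text == "":
        return None
    for caster, name in ((int, "int"), (float, "float")):
        try:
            caster(text)
            return name
        except ValueError:
            pass
    if "," in text:
        return "list"
    return "str"


print([guess_column_type(f) for f in fields])
# [None, 'str', 'int', 'float', 'list'] -- the leading None is later replaced by the
# default column type, giving the asserted ['str', 'str', 'int', 'float', 'list']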
