Skip to content

Commit

Permalink
treat empty files correctly
Browse files Browse the repository at this point in the history
as 0 column files

- refactor type_overrules_type for simplicity
- add tests
  • Loading branch information
bernt-matthias committed May 16, 2022
1 parent 7d9de54 commit 2604602
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 10 deletions.
17 changes: 8 additions & 9 deletions lib/galaxy/datatypes/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,18 +359,18 @@ def set_meta(
column_type_compare_order = list(column_type_set_order) # Order to compare column types
column_type_compare_order.reverse()

def type_overrules_type(column_type1, column_type2):
if column_type1 is None or column_type1 == column_type2:
def type_overrules_type(new_column_type, old_column_type):
if new_column_type is None or new_column_type == old_column_type:
return False
if column_type2 is None:
if old_column_type is None:
return True
for column_type in column_type_compare_order:
if column_type1 == column_type:
if new_column_type == column_type:
return True
if column_type2 == column_type:
if old_column_type == column_type:
return False
# neither column type was found in our ordered list, this cannot happen
raise ValueError(f"Tried to compare unknown column types: {column_type1} and {column_type2}")
raise ValueError(f"Tried to compare unknown column types: {new_column_type} and {old_column_type}")

def is_int(column_text):
# Don't allow underscores in numeric literals (PEP 515)
Expand Down Expand Up @@ -417,7 +417,7 @@ def guess_column_type(column_text):
comment_lines = 0
column_names = None
column_types = []
first_line_column_types = [default_column_type] # default value is one column of type str
first_line_column_types = []
if dataset.has_data():
# NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
with compression_utils.get_fileobj(dataset.file_name) as dataset_fh:
Expand Down Expand Up @@ -463,14 +463,13 @@ def guess_column_type(column_text):
comment_lines = None # Clear optional comment_lines metadata value; additional comment lines could appear below this point
break
i += 1
print(column_types)

# we error on the larger number of columns
# first we pad our column_types by using data from first line
if len(first_line_column_types) > len(column_types):
for column_type in first_line_column_types[len(column_types) :]:
column_types.append(column_type)
# Now we fill any unknown (None) column_types with data from first line
print(column_types)
for i in range(len(column_types)):
if column_types[i] is None:
if len(first_line_column_types) <= i or first_line_column_types[i] is None:
Expand Down
77 changes: 76 additions & 1 deletion test/unit/data/datatypes/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ def test_tabular_set_meta_large_file():


def test_tabular_set_meta_empty():
"""
empty file
"""
with tempfile.NamedTemporaryFile(mode="w") as test_file:
test_file.flush()
dataset = MockDataset(id=1)
Expand All @@ -36,4 +39,76 @@ def test_tabular_set_meta_empty():
assert dataset.metadata.column_types == []
assert dataset.metadata.columns == 0
assert dataset.metadata.delimiter == "\t"
assert not hasattr(dataset.metadata, "column_names")
assert not hasattr(dataset.metadata, "column_names")


def test_tabular_set_meta_nearly_empty():
    """
    A file containing only a single newline.

    Empty lines are counted as comment lines, so no data lines,
    columns, or column types should be detected.
    """
    with tempfile.NamedTemporaryFile(mode="w") as handle:
        handle.write("\n")
        handle.flush()
        ds = MockDataset(id=1)
        ds.file_name = handle.name
        Tabular().set_meta(ds)
        # the lone blank line is classified as a comment, not data
        assert ds.metadata.data_lines == 0
        assert ds.metadata.comment_lines == 1
        # no data lines -> zero columns and an empty type list
        assert ds.metadata.column_types == []
        assert ds.metadata.columns == 0
        assert ds.metadata.delimiter == "\t"
        assert not hasattr(ds.metadata, "column_names")


def test_tabular_column_types():
    """
    Check column type guessing for a file with two data lines.

    The first line gets special (header-like) treatment, so the
    detected types come from the second line; a column that is empty
    on every checked line falls back to the default type (str).
    """
    with tempfile.NamedTemporaryFile(mode="w") as test_file:
        # 1st line has special treatment which we want to ignore in this test
        test_file.write("\t\t\t\t\n")
        # note that the 1st column of this line will be detected as None
        # but this is overwritten by the default column type (str) after
        # checking all lines
        test_file.write("\tstr\t23\t42.00\ta,b,c\n")
        test_file.flush()
        dataset = MockDataset(id=1)
        dataset.file_name = test_file.name
        Tabular().set_meta(dataset)
        # both written lines count as data lines, none as comments
        assert dataset.metadata.data_lines == 2
        assert dataset.metadata.comment_lines == 0
        assert dataset.metadata.column_types == ["str", "str", "int", "float", "list"]
        assert dataset.metadata.columns == 5
        assert dataset.metadata.delimiter == "\t"
        assert not hasattr(dataset.metadata, "column_names")


def test_tabular_column_types_override():
    """
    check that guessed column types can be improved
    by the types guessed for later lines
    overwriting is only possible in the following order None -> int -> float -> list -> str
    also check that more columns can be added by later lines
    """
    with tempfile.NamedTemporaryFile(mode="w") as test_file:
        # 1st line has special treatment which we want to ignore in this test
        test_file.write("\t\t\t\t\n")
        # note that the first column is detected as None, which can be overwritten by int
        test_file.write("\t23\t42.00\ta,b,c\tstr\n")
        # the 3rd line widens column 1 int->int (kept), 2 float->float, 3 list,
        # 4 str, and adds a 6th column that earlier lines did not have
        test_file.write("23\t42.0\t23,42.0\tstr\t42\tanother column\n")
        test_file.flush()
        dataset = MockDataset(id=1)
        dataset.file_name = test_file.name
        Tabular().set_meta(dataset)
        # all three written lines are data lines; none are comments
        assert dataset.metadata.data_lines == 3
        assert dataset.metadata.comment_lines == 0
        assert dataset.metadata.column_types == ["int", "float", "list", "str", "str", "str"]
        assert dataset.metadata.columns == 6
        assert dataset.metadata.delimiter == "\t"
        assert not hasattr(dataset.metadata, "column_names")

0 comments on commit 2604602

Please sign in to comment.