From e2e45e618e57887c6c7dd58aa151e623a2d57a00 Mon Sep 17 00:00:00 2001 From: Bryan Cannon Date: Mon, 8 Jan 2024 13:08:51 -0800 Subject: [PATCH] =?UTF-8?q?Added=20fix=20to=20io=5Futils.list=5Ffiles=20fu?= =?UTF-8?q?nction=20for=20instances=20where=20column=20=E2=80=A6=20(#42)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added fix to io_utils.list_files function for instances where column numbers go past 0-9. * Forgot to add latest version of fix. Now updated. * Added double quotations for string in pattern search. * Updated pre-commit * Updated substr matching to account for a list of substrs * Updated matches variable in list_files. * Additionally updated list_folders substr matching with same fix in list_files. * Changed list_files/folder substr to use set matching instead of pure pattern recognition. * Added a step to regex split to filter out empty tokens. * Updated load_folders functionality to more closely match subsetting by strings. 
* Add more encompassing test cases * Fix formatting --------- Co-authored-by: alex-l-kong --- src/alpineer/io_utils.py | 17 ++++++++++-- tests/io_utils_test.py | 58 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/src/alpineer/io_utils.py b/src/alpineer/io_utils.py index f314612..682068a 100644 --- a/src/alpineer/io_utils.py +++ b/src/alpineer/io_utils.py @@ -1,6 +1,7 @@ import itertools import os import pathlib +import re import warnings from typing import List @@ -81,7 +82,13 @@ def list_files(dir_name, substrs=None, exact_match=False, ignore_hidden=True): if any([substr == os.path.splitext(file)[0] for substr in substrs]) ] else: - matches = [file for file in files if any([substr in file for substr in substrs])] + matches = [] + for substr in substrs: + substr_pattern = list(filter(bool, re.split("[^a-zA-Z0-9]", substr))) + for file in files: + file_pattern = list(filter(bool, re.split("[^a-zA-Z0-9]", file))) + if set(substr_pattern).issubset(file_pattern): + matches.append(file) return matches @@ -226,6 +233,12 @@ def list_folders(dir_name, substrs=None, exact_match=False, ignore_hidden=True): if any([substr == os.path.splitext(folder)[0] for substr in substrs]) ] else: - matches = [folder for folder in folders if any([substr in folder for substr in substrs])] + matches = [] + for substr in substrs: + substr_pattern = list(filter(bool, re.split("[^a-zA-Z0-9]", substr))) + for folder in folders: + folder_pattern = list(filter(bool, re.split("[^a-zA-Z0-9]", folder))) + if set(substr_pattern).issubset(folder_pattern): + matches.append(folder) return matches diff --git a/tests/io_utils_test.py b/tests/io_utils_test.py index cc535db..1e47901 100644 --- a/tests/io_utils_test.py +++ b/tests/io_utils_test.py @@ -127,6 +127,32 @@ def test_list_files(): ) assert sorted(get_hidden_files) == [".chan-metadata.tiff"] + # test delimiter functionality of substr matching + with tempfile.TemporaryDirectory() as temp_dir: + 
filenames = [ + "fov1.tiff", + "fov1_test.tiff", + "fov10.tiff", + "fov2.tiff", + "fov2_test.tiff", + "fov20.tiff", + "fov3.tiff", + "fov3_test.tiff", + "fov30.tiff", + ] + for filename in filenames: + pathlib.Path(os.path.join(temp_dir, filename)).touch() + + # test substrs is not list (single string) + get_txt = io_utils.list_files(temp_dir, substrs="fov1") + assert sorted(get_txt) == sorted(["fov1.tiff", "fov1_test.tiff"]) + + # test substrs is list + get_test_and_other = io_utils.list_files(temp_dir, substrs=["fov1", "fov2"]) + assert sorted(get_test_and_other) == sorted( + ["fov1.tiff", "fov1_test.tiff", "fov2.tiff", "fov2_test.tiff"] + ) + def test_remove_file_extensions(): # test a mixture of file paths and extensions @@ -206,7 +232,7 @@ def test_list_folders(): temp_dir, substrs=["test_", "other"], exact_match=False ) assert sorted(get_test_and_other) == sorted( - ["Ntest_csv", "test_csv", "test_csv1", "test_csv2", "test_out", "othertf_txt"] + ["test_csv", "test_csv1", "test_csv2", "test_out"] ) # Test hidden files @@ -243,3 +269,33 @@ def test_list_folders(): temp_dir, substrs=".hidden_dir", exact_match=True, ignore_hidden=False ) assert get_hidden_dirs == [".hidden_dir"] + + # test delimiter functionality of substr matching + with tempfile.TemporaryDirectory() as temp_dir: + dirnames = [ + "test1", + "test1_folder", + "test10", + "test2", + "test2_folder", + "test20", + "test3", + "test3_folder", + "test30", + ] + + dirnames.sort() + for dirname in dirnames: + os.mkdir(os.path.join(temp_dir, dirname)) + + # test substrs is not list (single string) + get_txt = io_utils.list_folders(temp_dir, substrs="test1", exact_match=False) + assert sorted(get_txt) == sorted(["test1", "test1_folder"]) + + # test substrs is list + get_test_and_other = io_utils.list_folders( + temp_dir, substrs=["test1", "test2"], exact_match=False + ) + assert sorted(get_test_and_other) == sorted( + ["test1", "test1_folder", "test2", "test2_folder"] + )