From 12fa4a4b690868cfeff27279959b7216b75b3e34 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 19 Jan 2024 13:41:35 -0500 Subject: [PATCH 01/30] testing initial refactor --- .../codex_common_errors_validator.py | 297 +++++++++--------- .../codex_json_validator.py | 20 +- .../fastq_validator.py | 7 +- .../fastq_validator_logic.py | 41 ++- src/ingest_validation_tests/gz_validator.py | 53 ++-- .../ome_tiff_validator.py | 64 +++- .../publication_validator.py | 73 ++--- src/ingest_validation_tests/tiff_validator.py | 68 +++- 8 files changed, 362 insertions(+), 261 deletions(-) diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 01bda98..e98c3c6 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -12,6 +12,7 @@ class QuitNowException(Exception): """ Signal exit from this validation test """ + pass @@ -43,166 +44,178 @@ class CodexCommonErrorsValidator(Validator): Test for some common errors in the directory and file structure of CODEX datasets. """ + description = "Test for common problems found in CODEX" cost = 1.0 + def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ + del kwargs if self.assay_type != 'CODEX': return [] # We only test CODEX data - rslt = [] - try: - # is the raw/src_ directory present? - prefix = None - if (self.path / 'raw').is_dir(): - prefix = self.path / 'raw' - else: - for candidate in self.path.glob('src_*'): - prefix = candidate - if prefix is None: - rslt.append('The raw/src_ subdirectory is missing?') - raise QuitNowException() - - # Does dataset.json exist? If so, 'new CODEX' syntax rules - # are in effect - dataset_json_exists = False - any_dataset_json_exists = False - for candidate in self.path.glob('**/dataset.json'): - any_dataset_json_exists = True - if candidate == prefix / 'dataset.json': - dataset_json_exists = True - if dataset_json_exists: - print('FOUND dataset.json; skipping further analysis') - raise QuitNowException() - elif any_dataset_json_exists: - rslt.append('A dataset.json file exists but' - ' is in the wrong place') - - # is the segmentation.json file on the right side? - found = False - right_place = False - for path in self.path.glob('*/[Ss]egmentation.json'): - rel_path = path.relative_to(self.path) - found = True - if str(rel_path).startswith(('raw', 'src_')): - right_place = True - if found: - if right_place: - pass + rslts = [] + for path in self.paths: + rslt = [] + try: + # is the raw/src_ directory present? + prefix = None + if (path / 'raw').is_dir(): + prefix = path / 'raw' else: - rslt.append('The segmentation.json file is in the wrong subdirectory') - else: - rslt.append('The segmentation.json file is missing or misplaced') - - # Does the channelnames.txt file exist? 
- channelnames_txt_path = prefix / 'channelnames.txt' - if not channelnames_txt_path.is_file(): - # sometimes we see this variant - channelnames_txt_path = prefix / 'channelNames.txt' - if not channelnames_txt_path.is_file(): - rslt.append('channelnames.txt is missing') + for candidate in path.glob('src_*'): + prefix = candidate + if prefix is None: + rslt.append('The raw/src_ subdirectory is missing?') raise QuitNowException() - # Parse channelnames.txt into a dataframe - try: - cn_df = pd.read_csv(str(channelnames_txt_path), header=None) - except Exception: - rslt.append(f'Unexpected error reading {channelnames_txt_path}') - raise QuitNowException() - if len(cn_df.columns) != 1: - rslt.append(f'Unexpected format for {channelnames_txt_path}') - raise QuitNowException() - - # Does the channelnames_report.csv file exist? - report_csv_path = prefix / 'channelnames_report.csv' - if report_csv_path.is_file(): - # Parse channelnames_report.txt into a dataframe + # Does dataset.json exist? If so, 'new CODEX' syntax rules + # are in effect + dataset_json_exists = False + any_dataset_json_exists = False + for candidate in path.glob('**/dataset.json'): + any_dataset_json_exists = True + if candidate == prefix / 'dataset.json': + dataset_json_exists = True + if dataset_json_exists: + print('FOUND dataset.json; skipping further analysis') + raise QuitNowException() + elif any_dataset_json_exists: + rslt.append( + 'A dataset.json file exists but' + ' is in the wrong place' + ) + + # is the segmentation.json file on the right side? + found = False + right_place = False + for path in path.glob('*/[Ss]egmentation.json'): + rel_path = path.relative_to(path) + found = True + if str(rel_path).startswith(('raw', 'src_')): + right_place = True + if found: + if right_place: + pass + else: + rslt.append( + 'The segmentation.json file is in the wrong subdirectory' + ) + else: + rslt.append('The segmentation.json file is missing or misplaced') + + # Does the channelnames.txt file exist? + channelnames_txt_path = prefix / 'channelnames.txt' + if not channelnames_txt_path.is_file(): + # sometimes we see this variant + channelnames_txt_path = prefix / 'channelNames.txt' + if not channelnames_txt_path.is_file(): + rslt.append('channelnames.txt is missing') + raise QuitNowException() + + # Parse channelnames.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) + cn_df = pd.read_csv(str(channelnames_txt_path), header=None) except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f'Unexpected error reading {channelnames_txt_path}') raise QuitNowException() - if len(rpt_df) == len(cn_df) + 1: - # channelnames_report.csv appears to have a header + if len(cn_df.columns) != 1: + rslt.append(f'Unexpected format for {channelnames_txt_path}') + raise QuitNowException() + + # Does the channelnames_report.csv file exist? + report_csv_path = prefix / 'channelnames_report.csv' + if report_csv_path.is_file(): + # Parse channelnames_report.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',') + rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) except Exception: rslt.append(f'Unexpected error reading {report_csv_path}') raise QuitNowException() - if len(rpt_df.columns) != 2: - rslt.append(f'Could not parse {report_csv_path}.' - ' Is it a comma-separated table?' - ) - raise QuitNowException() - col_0, col_1 = rpt_df.columns - rpt_df = rpt_df.rename(columns={col_0:'Marker', col_1:'Result'}) - # Do they match? 
- rpt_df['other'] = cn_df[0] - mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] - if len(mismatches_df) != 0: - for idx, row in mismatches_df.iterrows(): + if len(rpt_df) == len(cn_df) + 1: + # channelnames_report.csv appears to have a header + try: + rpt_df = pd.read_csv(str(report_csv_path), sep=',') + except Exception: + rslt.append(f'Unexpected error reading {report_csv_path}') + raise QuitNowException() + if len(rpt_df.columns) != 2: rslt.append( - f"{channelnames_txt_path.name} does not" - " match channelnames_report.txt" - f" on line {idx}: {row['other']} vs {row['Marker']}" + f'Could not parse {report_csv_path}.' + ' Is it a comma-separated table?' ) + raise QuitNowException() + col_0, col_1 = rpt_df.columns + rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'}) + # Do they match? + rpt_df['other'] = cn_df[0] + mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] + if len(mismatches_df) != 0: + for idx, row in mismatches_df.iterrows(): + rslt.append( + f'{channelnames_txt_path.name} does not' + ' match channelnames_report.txt' + f' on line {idx}: {row["other"]} vs {row["Marker"]}' + ) + raise QuitNowException() + else: + rpt_df = None + + # Tabulate the cycle and region info + all_cycle_dirs = [] + for glob_str in ['cyc*', 'Cyc*']: + for pth in prefix.glob(glob_str): + if pth.is_dir(): + all_cycle_dirs.append(str(pth.stem).lower()) + cycles = [] + regions = [] + failures = [] + for cyc_dir in all_cycle_dirs: + try: + cyc_id, reg_id = _split_cycle_dir_string(cyc_dir) + cycles.append(cyc_id) + regions.append(reg_id) + except AssertionError as excp: + failures.append(str(excp)) + if failures: + rslt += failures raise QuitNowException() - else: - rpt_df = None - - # Tabulate the cycle and region info - all_cycle_dirs = [] - for glob_str in ['cyc*', 'Cyc*']: - for pth in prefix.glob(glob_str): - if pth.is_dir(): - all_cycle_dirs.append(str(pth.stem).lower()) - cycles = [] - regions = [] - failures = [] - for cyc_dir in all_cycle_dirs: - try: - cyc_id, reg_id = _split_cycle_dir_string(cyc_dir) - cycles.append(cyc_id) - regions.append(reg_id) - except AssertionError as excp: - failures.append(str(excp)) - if failures: - rslt += failures - raise QuitNowException() - total_entries = len(cycles) - cycles = list(set(cycles)) - cycles.sort() - regions = list(set(regions)) - regions.sort() - failures = [] - # First cycle must be 1 - if cycles[0] != 1: - failures.append('Cycle numbering does not start at 1') - # First region must be 1 - if regions[0] != 1: - failures.append('Region numbering does not start at 1') - # Cycle range must be contiguous ints - if cycles != list(range(cycles[0], cycles[-1]+1)): - failures.append('Cycle numbers are not contiguous') - # Region range must be contiguous ints - if regions != list(range(regions[0], regions[-1]+1)): - failures.append('Region numbers are not contiguous') - # All cycle, region pairs must be present - if len(cycles) * len(regions) != total_entries: - failures.append('Not all cycle/region pairs are present') - # Total number of channels / total number of cycles must be integer, - # excluding any HandE channels - total_channel_count = len(cn_df) - h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) - channels_per_cycle = ((total_channel_count - h_and_e_channel_count) - / len(cycles)) - if channels_per_cycle != int(channels_per_cycle): - failures.append('The number of channels per cycle is not constant') - if failures: - rslt += failures - raise QuitNowException() - - except QuitNowException: - pass 
- return rslt + total_entries = len(cycles) + cycles = list(set(cycles)) + cycles.sort() + regions = list(set(regions)) + regions.sort() + failures = [] + # First cycle must be 1 + if cycles[0] != 1: + failures.append('Cycle numbering does not start at 1') + # First region must be 1 + if regions[0] != 1: + failures.append('Region numbering does not start at 1') + # Cycle range must be contiguous ints + if cycles != list(range(cycles[0], cycles[-1] + 1)): + failures.append('Cycle numbers are not contiguous') + # Region range must be contiguous ints + if regions != list(range(regions[0], regions[-1] + 1)): + failures.append('Region numbers are not contiguous') + # All cycle, region pairs must be present + if len(cycles) * len(regions) != total_entries: + failures.append('Not all cycle/region pairs are present') + # Total number of channels / total number of cycles must be integer, + # excluding any HandE channels + total_channel_count = len(cn_df) + h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) + channels_per_cycle = ( + total_channel_count - h_and_e_channel_count + ) / len(cycles) + if channels_per_cycle != int(channels_per_cycle): + failures.append('The number of channels per cycle is not constant') + if failures: + rslt += failures + raise QuitNowException() + rslts.append(rslt) + + except QuitNowException: + pass + return rslts diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 17b4ee9..5700cfe 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -1,10 +1,9 @@ -from typing import List -from pathlib import Path import json - -from jsonschema import validate +from pathlib import Path +from typing import List from ingest_validation_tools.plugin_validator import Validator +from jsonschema import validate class CodexJsonValidator(Validator): @@ -20,10 +19,11 @@ def collect_errors(self, **kwargs) -> List[str]: rslt = [] for glob_expr in ['**/dataset.json']: - for path in self.path.glob(glob_expr): - instance = json.loads(path.read_text()) - try: - validate(instance=instance, schema=schema) - except Exception as e: - rslt.append(f'{path}: {e}') + for path in self.paths: + for file in path.glob(glob_expr): + instance = json.loads(file.read_text()) + try: + validate(instance=instance, schema=schema) + except Exception as e: + rslt.append(f'{file}: {e}') return rslt diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index 42101e3..dca270b 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from ingest_validation_tools.plugin_validator import Validator from fastq_validator_logic import FASTQValidatorLogic, _log +from ingest_validation_tools.plugin_validator import Validator class FASTQValidator(Validator): @@ -10,8 +10,7 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') + _log(f'Threading at {self.threads}') validator = FASTQValidatorLogic(verbose=True) - validator.validate_fastq_files_in_path(self.path, threads) + validator.validate_fastq_files_in_path(self.paths, self.pool) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py 
b/src/ingest_validation_tests/fastq_validator_logic.py index 4043718..bb247ff 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -1,16 +1,16 @@ import argparse import gzip import re - +from multiprocessing import Manager, pool +from os import cpu_count from pathlib import Path from typing import Callable, List, TextIO -from multiprocessing import Pool, Manager, Lock import fastq_utils def is_valid_filename(filename: str) -> bool: - return fastq_utils.FASTQ_PATTERN.fullmatch(filename) + return bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)) def _open_fastq_file(file: Path) -> TextIO: @@ -26,14 +26,16 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, path: Path, lock): + def __init__(self, validate_object, paths: List[Path], lock): self.validate_object = validate_object - self.path = path + self.paths = paths self.lock = lock def __call__(self, fastq_file): - self.validate_object.validate_fastq_file(self.path / fastq_file, self.lock) - return next(iter(self.validate_object.errors), None) + for path in self.paths: + _log(f"Validating matching fastq files in {path.as_posix()}") + self.validate_object.validate_fastq_file(path / fastq_file, self.lock) + return next(iter(self.validate_object.errors), None) class FASTQValidatorLogic: @@ -126,7 +128,7 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List] = \ + validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ self._VALIDATE_FASTQ_LINE_METHODS[line_index] assert validator_method, \ @@ -202,17 +204,16 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: else: self._file_prefix_counts[filename_prefix] = records_read - def validate_fastq_files_in_path(self, path: Path, threads: int) -> None: + def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> None: data_found_one = [] - _log(f"Validating matching fastq files in {path.as_posix()}") - - dirs_and_files = fastq_utils.collect_fastq_files_by_directory(path) + dirs_and_files = {} + for path in paths: + dirs_and_files.update(fastq_utils.collect_fastq_files_by_directory(path)) with Manager() as manager: lock = manager.Lock() - for directory, file_list in dirs_and_files.items(): + for _, file_list in dirs_and_files.items(): try: - pool = Pool(threads) - engine = Engine(self, path, lock) + engine = Engine(self, paths, lock) data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f'Error {e}') @@ -233,13 +234,9 @@ def main(): args = parser.parse_args() validator = FASTQValidatorLogic(True) - - path: Path - for path in args.filepaths: - if path.is_dir(): - validator.validate_fastq_files_in_path(path, 1) - else: - validator.validate_fastq_file(path, Lock()) + threads = cpu_count() // 4 or 1 + pool = Pool(threads) + validator.validate_fastq_files_in_path(args.filepaths, pool) if __name__ == '__main__': diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index cb25bb1..c3cb66f 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -12,20 +12,22 @@ def _log(message: str): class Engine(object): - def __call__(self, filename): - excluded = r'.*/*fastq.gz' - if re.search(excluded, filename.as_posix()): - return - try: - _log(f'Threaded 
{filename}') - with gzip.open(filename) as g_f: - while True: - buf = g_f.read(1024*1024) - if not buf: - break - except Exception as e: - _log(f'{filename} is not a valid gzipped file {e}') - return f'{filename} is not a valid gzipped file' + def __call__(self, path_list): + for path in path_list: + for filename in path.glob('**/*.gz'): + excluded = r'.*/*fastq.gz' + if re.search(excluded, filename.as_posix()): + return + try: + _log(f'Threaded {filename}') + with gzip.open(filename) as g_f: + while True: + buf = g_f.read(1024*1024) + if not buf: + break + except Exception as e: + _log(f'{filename} is not a valid gzipped file {e}') + return f'{filename} is not a valid gzipped file' class GZValidator(Validator): @@ -34,17 +36,14 @@ class GZValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') - for glob_expr in ['**/*.gz']: - try: - pool = Pool(threads) - engine = Engine() - data_output = pool.imap_unordered(engine, self.path.glob(glob_expr)) - except Exception as e: - _log(f'Error {e}') - else: - pool.close() - pool.join() - [data_output2.append(output) for output in data_output if output] + _log(f'Threading at {self.threads}') + try: + engine = Engine() + data_output = self.pool.imap_unordered(engine, self.paths) + except Exception as e: + _log(f'Error {e}') + else: + pool.close() + pool.join() + [data_output2.append(output) for output in data_output if output] return data_output2 diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index fb05133..063fd7d 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -1,23 +1,65 @@ from typing import List +from pathlib import Path -import xmlschema import tifffile +import xmlschema from ingest_validation_tools.plugin_validator import Validator + +# def _check_ome_tiff_file(paths: str): +# rslt_list = [] +# for path in paths: +# try: +# with tifffile.TiffFile(path) as tf: +# xml_document = xmlschema.XmlDocument(tf.ome_metadata) +# if not xml_document.schema.is_valid(xml_document): +# return f"{path} is not a valid OME.TIFF file" +# except Exception as excp: +# rslt_list.append(f"{path} is not a valid OME.TIFF file: {excp}") +# return rslt_list +# +# def _glob_paths(path: str): +# filenames_to_test = [] +# for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: +# for file in path.glob(glob_expr): +# filenames_to_test.append(file) +# return filenames_to_test +# +# class OmeTiffValidator(Validator): description = "Recursively test all ome-tiff files for validity" cost = 1.0 + def collect_errors(self, **kwargs) -> List[str]: + del kwargs rslt = [] for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: - for path in self.path.glob(glob_expr): - try: - with tifffile.TiffFile(path) as tf: - xml_document = xmlschema.XmlDocument(tf.ome_metadata) - if not xml_document.schema.is_valid(xml_document): - rslt.append(f'{path} is not a valid OME.TIFF file') - except Exception as excp: - rslt.append(f'{path} is not a valid OME.TIFF file: {excp}') + for path in self.paths: + for file in path.glob(glob_expr): + try: + with tifffile.TiffFile(file) as tf: + xml_document = xmlschema.XmlDocument(tf.ome_metadata) + if not xml_document.schema.is_valid(xml_document): + rslt.append(f'{file} is not a valid OME.TIFF file') + except Exception as excp: + 
rslt.append(f'{file} is not a valid OME.TIFF file: {excp}') return rslt - - + # def collect_errors(self, **kwargs) -> List[str]: + # del kwargs + # filenames_to_test = list( + # file + # for file in self.pool.imap_unordered(_glob_paths, self.paths) + # if file is not None + # ) + # # TODO: this is awful + # return list( + # x + # for y in list( + # rslt + # for rslt in self.pool.imap_unordered( + # _check_ome_tiff_file, filenames_to_test + # ) + # if rslt is not None + # ) + # for x in y + # ) diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py index 9399f2c..4c7ccc1 100644 --- a/src/ingest_validation_tests/publication_validator.py +++ b/src/ingest_validation_tests/publication_validator.py @@ -19,46 +19,49 @@ class PublicationValidator(Validator): cost = 1.0 base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)' url_re = r'[Uu][Rr][Ll]' + def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ + del kwargs if self.assay_type != 'Publication': return [] # We only test Publication data rslt = [] - try: - vignette_path = self.path / 'vignettes' - assert vignette_path.is_dir(), 'vignettes not found or not a directory' - for this_vignette_path in vignette_path.glob('*'): - assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" - " in vignettes") - this_vignette_all_paths = set(this_vignette_path.glob('*')) - if not all(pth.is_file() for pth in this_vignette_all_paths): - raise AssertionError('Found a subdirectory in a vignette') - md_found = False - vig_figures = [] - for md_path in this_vignette_path.glob('*.md'): - if md_found: - raise AssertionError('A vignette has more than one markdown file') - else: - md_found = True - vig_fm = frontmatter.loads(md_path.read_text()) - for key in ['name', 'figures']: - assert key in vig_fm.metadata, ('vignette markdown is incorrectly' - f' formatted or has no {key}') - for fig_dict in vig_fm.metadata['figures']: - assert 'file' in fig_dict, 'figure dict does not reference a file' - assert 'name' in fig_dict, 'figure dict does not provide a name' - vig_figures.append(fig_dict['file']) - this_vignette_all_paths.remove(md_path) - for fname in vig_figures: - rslt.extend(self.validate_vitessce_config(this_vignette_path / fname)) - this_vignette_all_paths.remove(this_vignette_path / fname) - assert not this_vignette_all_paths, ('unexpected files in vignette:' - f' {list(str(elt) for elt in this_vignette_all_paths)}') + for path in self.paths: + try: + vignette_path = path / 'vignettes' + assert vignette_path.is_dir(), 'vignettes not found or not a directory' + for this_vignette_path in vignette_path.glob('*'): + assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" + " in vignettes") + this_vignette_all_paths = set(this_vignette_path.glob('*')) + if not all(pth.is_file() for pth in this_vignette_all_paths): + raise AssertionError('Found a subdirectory in a vignette') + md_found = False + vig_figures = [] + for md_path in this_vignette_path.glob('*.md'): + if md_found: + raise AssertionError('A vignette has more than one markdown file') + else: + md_found = True + vig_fm = frontmatter.loads(md_path.read_text()) + for key in ['name', 'figures']: + assert key in vig_fm.metadata, ('vignette markdown is incorrectly' + f' formatted or has no {key}') + for fig_dict in vig_fm.metadata['figures']: + assert 'file' in fig_dict, 'figure dict does not reference a file' + assert 'name' in fig_dict, 'figure dict does not 
provide a name' + vig_figures.append(fig_dict['file']) + this_vignette_all_paths.remove(md_path) + for fname in vig_figures: + rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path)) + this_vignette_all_paths.remove(this_vignette_path / fname) + assert not this_vignette_all_paths, ('unexpected files in vignette:' + f' {list(str(elt) for elt in this_vignette_all_paths)}') - except AssertionError as excp: - rslt.append(str(excp)) + except AssertionError as excp: + rslt.append(str(excp)) return rslt @@ -82,16 +85,16 @@ def url_search_iter(self, root): else: raise AssertionError(f"what is {root} of type {type(root)} ?") - def validate_vitessce_config(self, json_path): + def validate_vitessce_config(self, json_path, path): rslt = [] with open(json_path) as f: dct = json.load(f) - for key, val in self.url_search_iter(dct): + for _, val in self.url_search_iter(dct): try: match = re.match(self.base_url_re, val) if match: # it starts with {{ base_url }} extra_url = match.group(2) - data_path = self.path / 'data' / extra_url + data_path = path / 'data' / extra_url assert data_path.exists(), ("expected data file" f" {Path('data') / extra_url} is absent") diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index fc10da1..2b22ec1 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -1,6 +1,7 @@ -from os import cpu_count from multiprocessing import Pool -from typing import List +from os import cpu_count +from pathlib import Path +from typing import List, Optional import tifffile from ingest_validation_tools.plugin_validator import Validator @@ -8,8 +9,12 @@ # monkey patch tifffile to raise an exception every time a warning # is logged original_log_warning = tifffile.tifffile.log_warning + + def my_log_warning(msg, *args, **kwargs): raise RuntimeError(f"{msg.format(*args, **kwargs)}") + + tifffile.tifffile.log_warning = my_log_warning @@ -28,16 +33,59 @@ def _check_tiff_file(path: str) -> str or None: return f"{path} is not a valid TIFF file: {excp}" +# def _check_tiff_file(paths: List) -> Optional[str]: +# rslt_list = [] +# for path in paths: +# try: +# with tifffile.TiffFile(path) as tfile: +# for page in tfile.pages: +# _ = page.asarray() # force decompression +# return None +# except Exception as excp: +# _log(f"{str(path)} is not a valid TIFF file: {excp}") +# rslt_list.append(f"{path} is not a valid TIFF file: {excp}") +# return rslt_list +# + + +# def _glob_paths(path: str): +# filenames_to_test = [] +# for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: +# for file in path.glob(glob_expr): +# filenames_to_test.append(file) +# return filenames_to_test + + class TiffValidator(Validator): description = "Recursively test all tiff files that are not ome.tiffs for validity" cost = 1.0 + def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - pool = Pool(threads) + del kwargs filenames_to_test = [] - for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: - for path in self.path.glob(glob_expr): - filenames_to_test.append(path) - return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, - filenames_to_test) - if rslt is not None) + for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: + for path in self.paths: + for file in path.glob(glob_expr): + filenames_to_test.append(file) + return list( + rslt + for rslt in self.pool.imap_unordered(_check_tiff_file, 
filenames_to_test) + if rslt is not None + ) + # filenames_to_test = list( + # file + # for file in self.pool.imap_unordered(_glob_paths, self.paths) + # if file is not None + # ) + # # TODO: this is awful + # return list( + # x + # for y in list( + # rslt + # for rslt in self.pool.imap_unordered( + # _check_tiff_file, filenames_to_test + # ) + # if rslt is not None + # ) + # for x in y + # ) From 51c0d7381328903597d5f5a23209b27fc2d65488 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 22 Jan 2024 14:39:35 -0500 Subject: [PATCH 02/30] fixing mistake in gz_validator --- src/ingest_validation_tests/gz_validator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index c3cb66f..c552a7d 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,6 +1,4 @@ -from os import cpu_count import re -from multiprocessing import Pool from typing import List import gzip @@ -43,7 +41,7 @@ def collect_errors(self, **kwargs) -> List[str]: except Exception as e: _log(f'Error {e}') else: - pool.close() - pool.join() + self.pool.close() + self.pool.join() [data_output2.append(output) for output in data_output if output] return data_output2 From f5b16d4381484a367c3f1978481ea6f2c0d9ae3b Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 22 Jan 2024 15:01:20 -0500 Subject: [PATCH 03/30] fixing some mistakes --- .../fastq_validator_logic.py | 2 +- src/ingest_validation_tests/gz_validator.py | 31 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index bb247ff..19f0b4e 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -1,7 +1,7 @@ import argparse import gzip import re -from multiprocessing import Manager, pool +from multiprocessing import Manager, Pool, pool from os import cpu_count from pathlib import Path from typing import Callable, List, TextIO diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index c552a7d..389ba08 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -10,22 +10,21 @@ def _log(message: str): class Engine(object): - def __call__(self, path_list): - for path in path_list: - for filename in path.glob('**/*.gz'): - excluded = r'.*/*fastq.gz' - if re.search(excluded, filename.as_posix()): - return - try: - _log(f'Threaded {filename}') - with gzip.open(filename) as g_f: - while True: - buf = g_f.read(1024*1024) - if not buf: - break - except Exception as e: - _log(f'{filename} is not a valid gzipped file {e}') - return f'{filename} is not a valid gzipped file' + def __call__(self, path): + for filename in path.glob('**/*.gz'): + excluded = r'.*/*fastq.gz' + if re.search(excluded, filename.as_posix()): + return + try: + _log(f'Threaded {filename}') + with gzip.open(filename) as g_f: + while True: + buf = g_f.read(1024*1024) + if not buf: + break + except Exception as e: + _log(f'{filename} is not a valid gzipped file {e}') + return f'{filename} is not a valid gzipped file' class GZValidator(Validator): From 8eff29434b077ff37616bacdbe65959c2e61734b Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 22 Jan 2024 15:49:05 -0500 Subject: [PATCH 04/30] accounting for nested dict structure in fastq 
--- .../fastq_validator_logic.py | 135 ++++++++++-------- 1 file changed, 77 insertions(+), 58 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 19f0b4e..e9569d7 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -1,6 +1,8 @@ import argparse import gzip +import logging import re +from collections import defaultdict from multiprocessing import Manager, Pool, pool from os import cpu_count from pathlib import Path @@ -14,10 +16,7 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return ( - gzip.open(file, 'rt') if file.name.endswith('.gz') - else file.open() - ) + return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() def _log(message: str) -> str: @@ -56,13 +55,13 @@ class FASTQValidatorLogic: # This pattern seeks out the string that includes the lane number (since # that is expected to be present to help anchor the prefix) that comes # before any of _I1, _I2, _R1, or _R2. - _FASTQ_FILE_PREFIX_REGEX = re.compile(r'(.+_L\d+.*)_[IR][12][._]') + _FASTQ_FILE_PREFIX_REGEX = re.compile(r"(.+_L\d+.*)_[IR][12][._]") - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' + _FASTQ_LINE_2_VALID_CHARS = "ACGNT" def __init__(self, verbose=False): self.errors: List[str] = [] - self._filename = '' + self._filename = "" self._line_number = 0 self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() @@ -84,7 +83,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != '@': + if not line or line[0] != "@": return ["Line does not begin with '@'."] return [] @@ -93,46 +92,54 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = ''.join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) + invalid_chars = "".join( + c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS + ) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != '+': + if not line or line[0] != "+": return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append("Line contains invalid quality character(s): " - f'"{invalid_chars}"') + errors.append( + "Line contains invalid quality character(s): " f'"{invalid_chars}"' + ) if len(line) != self._line_2_length: - errors.append(f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters.") + errors.append( + f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters." 
+ ) return errors - _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4} + _VALIDATE_FASTQ_LINE_METHODS = { + 1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4, + } def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ - self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[ + [FASTQValidatorLogic, str], List[str] + ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert validator_method, \ - f"No validator method defined for record index {line_index}" + assert ( + validator_method + ), f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -144,8 +151,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) for error in - self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) + for error in self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -156,9 +163,11 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. - self.errors.append(_log( - "Filename does not have proper format " - "and will not be processed")) + self.errors.append( + _log( + "Filename does not have proper format " "and will not be processed" + ) + ) return self._line_number = 0 @@ -169,14 +178,16 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: records_read = self.validate_fastq_stream(fastq_data) # TODO: Add gzip.BadGzipFile when Python 3.8 is available except IOError: - self.errors.append( - self._format_error("Unable to open FASTQ data file.")) + self.errors.append(self._format_error("Unable to open FASTQ data file.")) return if fastq_file.name in self._file_record_counts.keys(): - self.errors.append(_log( - f"{fastq_file.name} has been found multiple times during this " - "validation.")) + self.errors.append( + _log( + f"{fastq_file.name} has been found multiple times during this " + "validation." + ) + ) self._file_record_counts[fastq_file.name] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) @@ -189,47 +200,55 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: # Find a file we've validated already that matches this # prefix. extant_files = [ - filename for filename, record_count - in self._file_record_counts.items() - if record_count == extant_count and filename.startswith(filename_prefix) + filename + for filename, record_count in self._file_record_counts.items() + if record_count == extant_count + and filename.startswith(filename_prefix) ] # Based on how the dictionaries are created, there should # always be at least one matching filename. assert extant_files - self.errors.append(_log( - f"{fastq_file.name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines).")) + self.errors.append( + _log( + f"{fastq_file.name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines)." 
+ ) + ) else: self._file_prefix_counts[filename_prefix] = records_read def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> None: data_found_one = [] - dirs_and_files = {} + dirs_and_files = defaultdict(dict) for path in paths: - dirs_and_files.update(fastq_utils.collect_fastq_files_by_directory(path)) + dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) + logging.info(f"Added files from {path} to dirs_and_files: {dirs_and_files}") with Manager() as manager: lock = manager.Lock() - for _, file_list in dirs_and_files.items(): - try: - engine = Engine(self, paths, lock) - data_output = pool.imap_unordered(engine, file_list) - except Exception as e: - _log(f'Error {e}') - else: - pool.close() - pool.join() - [data_found_one.append(output) for output in data_output if output] + for path, rel_path in dirs_and_files.items(): + for _, file_list in rel_path.items(): + try: + logging.info(f"Passing paths {paths} to engine.") + engine = Engine(self, paths, lock) + data_output = pool.imap_unordered(engine, file_list) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_found_one.append(output) for output in data_output if output] if len(data_found_one) > 0: self.errors = data_found_one def main(): - parser = argparse.ArgumentParser(description='Validate FASTQ files.') - parser.add_argument('filepaths', type=Path, nargs='+', - help="Files to validate for FASTQ syntax") + parser = argparse.ArgumentParser(description="Validate FASTQ files.") + parser.add_argument( + "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" + ) args = parser.parse_args() @@ -239,5 +258,5 @@ def main(): validator.validate_fastq_files_in_path(args.filepaths, pool) -if __name__ == '__main__': +if __name__ == "__main__": main() From ff856b5a07731ada6e7fb0f8d95400f2e6321141 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 11:59:51 -0500 Subject: [PATCH 05/30] changing threading back to original paradigm --- .../fastq_validator.py | 5 +- .../fastq_validator_logic.py | 117 ++++++++---------- src/ingest_validation_tests/gz_validator.py | 12 +- .../ome_tiff_validator.py | 39 ------ src/ingest_validation_tests/tiff_validator.py | 53 +------- 5 files changed, 68 insertions(+), 158 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index dca270b..d31335f 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -10,7 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - _log(f'Threading at {self.threads}') + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") validator = FASTQValidatorLogic(verbose=True) - validator.validate_fastq_files_in_path(self.paths, self.pool) + validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index e9569d7..315d2ae 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -16,7 +16,10 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() + return ( + gzip.open(file, 'rt') if file.name.endswith('.gz') + else file.open() + ) def 
_log(message: str) -> str: @@ -31,10 +34,12 @@ def __init__(self, validate_object, paths: List[Path], lock): self.lock = lock def __call__(self, fastq_file): + errors = [] for path in self.paths: _log(f"Validating matching fastq files in {path.as_posix()}") self.validate_object.validate_fastq_file(path / fastq_file, self.lock) - return next(iter(self.validate_object.errors), None) + errors.append(next(iter(self.validate_object.errors), None)) + return errors class FASTQValidatorLogic: @@ -55,13 +60,13 @@ class FASTQValidatorLogic: # This pattern seeks out the string that includes the lane number (since # that is expected to be present to help anchor the prefix) that comes # before any of _I1, _I2, _R1, or _R2. - _FASTQ_FILE_PREFIX_REGEX = re.compile(r"(.+_L\d+.*)_[IR][12][._]") + _FASTQ_FILE_PREFIX_REGEX = re.compile(r'(.+_L\d+.*)_[IR][12][._]') - _FASTQ_LINE_2_VALID_CHARS = "ACGNT" + _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' def __init__(self, verbose=False): self.errors: List[str] = [] - self._filename = "" + self._filename = '' self._line_number = 0 self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() @@ -83,7 +88,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != "@": + if not line or line[0] != '@': return ["Line does not begin with '@'."] return [] @@ -92,54 +97,45 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = "".join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS - ) + invalid_chars = ''.join( + c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != "+": + if not line or line[0] != '+': return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append( - "Line contains invalid quality character(s): " f'"{invalid_chars}"' - ) + errors.append("Line contains invalid quality character(s): " + f'"{invalid_chars}"') if len(line) != self._line_2_length: - errors.append( - f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters." 
- ) - + errors.append(f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters.") return errors - _VALIDATE_FASTQ_LINE_METHODS = { - 1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4, - } + _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4} def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[ - [FASTQValidatorLogic, str], List[str] - ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ + self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert ( - validator_method - ), f"No validator method defined for record index {line_index}" + assert validator_method, \ + f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -151,8 +147,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) - for error in self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) for error in + self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -163,11 +159,9 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. - self.errors.append( - _log( - "Filename does not have proper format " "and will not be processed" - ) - ) + self.errors.append(_log( + "Filename does not have proper format " + "and will not be processed")) return self._line_number = 0 @@ -178,16 +172,14 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: records_read = self.validate_fastq_stream(fastq_data) # TODO: Add gzip.BadGzipFile when Python 3.8 is available except IOError: - self.errors.append(self._format_error("Unable to open FASTQ data file.")) + self.errors.append( + self._format_error("Unable to open FASTQ data file.")) return if fastq_file.name in self._file_record_counts.keys(): - self.errors.append( - _log( - f"{fastq_file.name} has been found multiple times during this " - "validation." - ) - ) + self.errors.append(_log( + f"{fastq_file.name} has been found multiple times during this " + "validation.")) self._file_record_counts[fastq_file.name] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) @@ -200,22 +192,18 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: # Find a file we've validated already that matches this # prefix. extant_files = [ - filename - for filename, record_count in self._file_record_counts.items() - if record_count == extant_count - and filename.startswith(filename_prefix) + filename for filename, record_count + in self._file_record_counts.items() + if record_count == extant_count and filename.startswith(filename_prefix) ] # Based on how the dictionaries are created, there should # always be at least one matching filename. assert extant_files - self.errors.append( - _log( - f"{fastq_file.name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines)." 
- ) - ) + self.errors.append(_log( + f"{fastq_file.name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines).")) else: self._file_prefix_counts[filename_prefix] = records_read @@ -224,7 +212,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> No dirs_and_files = defaultdict(dict) for path in paths: dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) - logging.info(f"Added files from {path} to dirs_and_files: {dirs_and_files}") + _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") with Manager() as manager: lock = manager.Lock() for path, rel_path in dirs_and_files.items(): @@ -245,18 +233,15 @@ def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> No def main(): - parser = argparse.ArgumentParser(description="Validate FASTQ files.") - parser.add_argument( - "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" - ) + parser = argparse.ArgumentParser(description='Validate FASTQ files.') + parser.add_argument('filepaths', type=Path, nargs='+', + help="Files to validate for FASTQ syntax") args = parser.parse_args() validator = FASTQValidatorLogic(True) - threads = cpu_count() // 4 or 1 - pool = Pool(threads) - validator.validate_fastq_files_in_path(args.filepaths, pool) + validator.validate_fastq_files_in_path(args.filepaths) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index 389ba08..e6ca6d5 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,3 +1,5 @@ +from multiprocessing import Pool +from os import cpu_count import re from typing import List @@ -33,14 +35,16 @@ class GZValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] - _log(f'Threading at {self.threads}') + threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + _log(f'Threading at {threads}') try: + pool = Pool(threads) engine = Engine() - data_output = self.pool.imap_unordered(engine, self.paths) + data_output = pool.imap_unordered(engine, self.paths) except Exception as e: _log(f'Error {e}') else: - self.pool.close() - self.pool.join() + pool.close() + pool.join() [data_output2.append(output) for output in data_output if output] return data_output2 diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index 063fd7d..2979035 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -6,26 +6,6 @@ from ingest_validation_tools.plugin_validator import Validator -# def _check_ome_tiff_file(paths: str): -# rslt_list = [] -# for path in paths: -# try: -# with tifffile.TiffFile(path) as tf: -# xml_document = xmlschema.XmlDocument(tf.ome_metadata) -# if not xml_document.schema.is_valid(xml_document): -# return f"{path} is not a valid OME.TIFF file" -# except Exception as excp: -# rslt_list.append(f"{path} is not a valid OME.TIFF file: {excp}") -# return rslt_list -# -# def _glob_paths(path: str): -# filenames_to_test = [] -# for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: -# for file in path.glob(glob_expr): -# filenames_to_test.append(file) -# return filenames_to_test -# -# class OmeTiffValidator(Validator): description = "Recursively test all ome-tiff files for validity" cost = 1.0 @@ -44,22 
+24,3 @@ def collect_errors(self, **kwargs) -> List[str]: except Exception as excp: rslt.append(f'{file} is not a valid OME.TIFF file: {excp}') return rslt - # def collect_errors(self, **kwargs) -> List[str]: - # del kwargs - # filenames_to_test = list( - # file - # for file in self.pool.imap_unordered(_glob_paths, self.paths) - # if file is not None - # ) - # # TODO: this is awful - # return list( - # x - # for y in list( - # rslt - # for rslt in self.pool.imap_unordered( - # _check_ome_tiff_file, filenames_to_test - # ) - # if rslt is not None - # ) - # for x in y - # ) diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 2b22ec1..ab033e7 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -33,59 +33,18 @@ def _check_tiff_file(path: str) -> str or None: return f"{path} is not a valid TIFF file: {excp}" -# def _check_tiff_file(paths: List) -> Optional[str]: -# rslt_list = [] -# for path in paths: -# try: -# with tifffile.TiffFile(path) as tfile: -# for page in tfile.pages: -# _ = page.asarray() # force decompression -# return None -# except Exception as excp: -# _log(f"{str(path)} is not a valid TIFF file: {excp}") -# rslt_list.append(f"{path} is not a valid TIFF file: {excp}") -# return rslt_list -# - - -# def _glob_paths(path: str): -# filenames_to_test = [] -# for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: -# for file in path.glob(glob_expr): -# filenames_to_test.append(file) -# return filenames_to_test - - class TiffValidator(Validator): description = "Recursively test all tiff files that are not ome.tiffs for validity" cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - del kwargs + threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + pool = Pool(threads) filenames_to_test = [] - for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: + for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: for path in self.paths: for file in path.glob(glob_expr): filenames_to_test.append(file) - return list( - rslt - for rslt in self.pool.imap_unordered(_check_tiff_file, filenames_to_test) - if rslt is not None - ) - # filenames_to_test = list( - # file - # for file in self.pool.imap_unordered(_glob_paths, self.paths) - # if file is not None - # ) - # # TODO: this is awful - # return list( - # x - # for y in list( - # rslt - # for rslt in self.pool.imap_unordered( - # _check_tiff_file, filenames_to_test - # ) - # if rslt is not None - # ) - # for x in y - # ) + return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, + filenames_to_test) + if rslt is not None) From 1a2626aea6dbb04a5e7ed12177de268de5c27122 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 12:28:46 -0500 Subject: [PATCH 06/30] bugfixing --- src/ingest_validation_tests/codex_json_validator.py | 1 + src/ingest_validation_tests/fastq_validator_logic.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 5700cfe..1e9e9ba 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -11,6 +11,7 @@ class CodexJsonValidator(Validator): cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: + del kwargs if 'codex' not in self.assay_type.lower(): return [] diff --git a/src/ingest_validation_tests/fastq_validator_logic.py 
b/src/ingest_validation_tests/fastq_validator_logic.py index 315d2ae..0065f06 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -3,8 +3,7 @@ import logging import re from collections import defaultdict -from multiprocessing import Manager, Pool, pool -from os import cpu_count +from multiprocessing import Lock, Manager, Pool from pathlib import Path from typing import Callable, List, TextIO @@ -207,7 +206,7 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: else: self._file_prefix_counts[filename_prefix] = records_read - def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> None: + def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: data_found_one = [] dirs_and_files = defaultdict(dict) for path in paths: @@ -219,6 +218,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> No for _, file_list in rel_path.items(): try: logging.info(f"Passing paths {paths} to engine.") + pool = Pool(threads) engine = Engine(self, paths, lock) data_output = pool.imap_unordered(engine, file_list) except Exception as e: @@ -240,7 +240,7 @@ def main(): args = parser.parse_args() validator = FASTQValidatorLogic(True) - validator.validate_fastq_files_in_path(args.filepaths) + validator.validate_fastq_files_in_path(args.filepaths, Lock()) if __name__ == '__main__': From adc24ef65881075cb3475450cde9607f8351be97 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 13:58:29 -0500 Subject: [PATCH 07/30] compare full paths rather than filenames while making sure there are not duplicate files in a dir --- src/ingest_validation_tests/fastq_validator.py | 6 +++--- src/ingest_validation_tests/fastq_validator_logic.py | 8 ++++---- src/ingest_validation_tests/tiff_validator.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index d31335f..0a30e42 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from fastq_validator_logic import FASTQValidatorLogic, _log from ingest_validation_tools.plugin_validator import Validator +from fastq_validator_logic import FASTQValidatorLogic, _log class FASTQValidator(Validator): @@ -10,8 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 - _log(f"Threading at {threads}") + threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + _log(f'Threading at {threads}') validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 0065f06..0b78e21 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -37,7 +37,7 @@ def __call__(self, fastq_file): for path in self.paths: _log(f"Validating matching fastq files in {path.as_posix()}") self.validate_object.validate_fastq_file(path / fastq_file, self.lock) - errors.append(next(iter(self.validate_object.errors), None)) + errors.append(next(iter(self.validate_object.errors))) return errors @@ -175,11 +175,11 @@ def validate_fastq_file(self, 
fastq_file: Path, lock) -> None: self._format_error("Unable to open FASTQ data file.")) return - if fastq_file.name in self._file_record_counts.keys(): + if fastq_file in self._file_record_counts.keys(): self.errors.append(_log( - f"{fastq_file.name} has been found multiple times during this " + f"{fastq_file} has been found ultiple times during this " "validation.")) - self._file_record_counts[fastq_file.name] = records_read + self._file_record_counts[fastq_file] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) with lock: diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index ab033e7..57aba4d 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -1,7 +1,6 @@ from multiprocessing import Pool from os import cpu_count -from pathlib import Path -from typing import List, Optional +from typing import List import tifffile from ingest_validation_tools.plugin_validator import Validator @@ -41,6 +40,7 @@ def collect_errors(self, **kwargs) -> List[str]: threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] + # TODO: this does not exclude OME.TIFF files, should it? for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: for path in self.paths: for file in path.glob(glob_expr): From be508e359710e060d35f137363145c38141ca231 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 24 Jan 2024 15:55:07 -0500 Subject: [PATCH 08/30] making tests pass --- requirements.txt | 10 ++-- .../codex_common_errors_validator.py | 15 ++++- .../fastq_validator_logic.py | 44 +++++++++------ src/ingest_validation_tests/gz_validator.py | 55 ++++++++++--------- tests/test_codex_common_errors_validator.py | 2 +- tests/test_fastq_validator_logic.py | 14 ++--- 6 files changed, 81 insertions(+), 59 deletions(-) diff --git a/requirements.txt b/requirements.txt index e5a685b..9ac14b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -xmlschema>=1.6 -tifffile==2020.10.1 +git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +imagecodecs>=2023.3.16 jsonschema==4.4.0 pandas>=1.2.0 -imagecodecs>=2023.3.16 -python-frontmatter>=1.0.0 -git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +python-frontmatter>=1.1.0 +tifffile==2020.10.1 +xmlschema>=1.6 diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index e98c3c6..8954b05 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -58,6 +58,7 @@ def collect_errors(self, **kwargs) -> List[str]: rslts = [] for path in self.paths: rslt = [] + # TODO: this seems to be replicating dir validation? Is this necessary for legacy support? try: # is the raw/src_ directory present? prefix = None @@ -90,8 +91,8 @@ def collect_errors(self, **kwargs) -> List[str]: # is the segmentation.json file on the right side? 
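# (The hunk below renames the loop variable from `path` to `filepath`;
# reusing `path` shadowed the dataset path that relative_to() compares
# against. A minimal sketch of the intended check, with hypothetical paths:
#     path = Path("dataset")
#     filepath = path / "raw" / "segmentation.json"
#     rel_path = filepath.relative_to(path)       # PosixPath('raw/segmentation.json')
#     str(rel_path).startswith(("raw", "src_"))   # True -> right place
# )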
found = False right_place = False - for path in path.glob('*/[Ss]egmentation.json'): - rel_path = path.relative_to(path) + for filepath in path.glob('*/[Ss]egmentation.json'): + rel_path = filepath.relative_to(path) found = True if str(rel_path).startswith(('raw', 'src_')): right_place = True @@ -113,6 +114,7 @@ def collect_errors(self, **kwargs) -> List[str]: if not channelnames_txt_path.is_file(): rslt.append('channelnames.txt is missing') raise QuitNowException() + # TODO: end questionable portion # Parse channelnames.txt into a dataframe try: @@ -214,8 +216,15 @@ def collect_errors(self, **kwargs) -> List[str]: if failures: rslt += failures raise QuitNowException() - rslts.append(rslt) + if type(rslt) is list: + rslts.extend(rslt) + else: + rslts.append(rslt) except QuitNowException: + if type(rslt) is list: + rslts.extend(rslt) + else: + rslts.append(rslt) pass return rslts diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 0b78e21..9e212b3 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -27,17 +27,17 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, paths: List[Path], lock): + def __init__(self, validate_object, path: Path, lock): self.validate_object = validate_object - self.paths = paths + self.path = path self.lock = lock def __call__(self, fastq_file): errors = [] - for path in self.paths: - _log(f"Validating matching fastq files in {path.as_posix()}") - self.validate_object.validate_fastq_file(path / fastq_file, self.lock) - errors.append(next(iter(self.validate_object.errors))) + _log(f"Validating matching fastq files in {self.path}") + self.validate_object.validate_fastq_file(self.path / fastq_file, self.lock) + for err in self.validate_object.errors: + errors.append(err) return errors @@ -174,12 +174,13 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: self.errors.append( self._format_error("Unable to open FASTQ data file.")) return - - if fastq_file in self._file_record_counts.keys(): + # TODO: this locates duplicate filenames across dirs, is that right? + # Difficult to log instances because first is not captured. 
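# (To illustrate the TODO above with hypothetical paths: because the dict
# below is keyed on bare filenames, dataset/raw/a_L001_R1.fastq.gz and
# dataset/processed/a_L001_R1.fastq.gz both map to "a_L001_R1.fastq.gz",
# so distinct files in different subdirectories get reported as duplicates.)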
+ if fastq_file.name in self._file_record_counts.keys(): self.errors.append(_log( - f"{fastq_file} has been found ultiple times during this " + f"{fastq_file.name} has been found multiple times during this " "validation.")) - self._file_record_counts[fastq_file] = records_read + self._file_record_counts[fastq_file.name] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) with lock: @@ -214,19 +215,20 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") with Manager() as manager: lock = manager.Lock() - for path, rel_path in dirs_and_files.items(): - for _, file_list in rel_path.items(): + for path, rel_paths in dirs_and_files.items(): + for rel_path, file_list in rel_paths.items(): + fullpath = Path(path / rel_path) try: - logging.info(f"Passing paths {paths} to engine.") + logging.info(f"Passing paths {fullpath} to engine.") pool = Pool(threads) - engine = Engine(self, paths, lock) + engine = Engine(self, fullpath, lock) data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f"Error {e}") else: pool.close() pool.join() - [data_found_one.append(output) for output in data_output if output] + [data_found_one.extend(output) for output in data_output if output] if len(data_found_one) > 0: self.errors = data_found_one @@ -238,9 +240,19 @@ def main(): help="Files to validate for FASTQ syntax") args = parser.parse_args() + if isinstance(args.filepaths, List): + filepaths = [Path(path) for path in args.filepaths] + elif isinstance(args.filepaths, Path): + filepaths = [args.filepaths] + elif isinstance(args.filepaths, str): + filepaths = [Path(args.filepaths)] + else: + raise Exception( + f"Validator init received base_paths arg as type {type(args.filepaths)}" + ) validator = FASTQValidatorLogic(True) - validator.validate_fastq_files_in_path(args.filepaths, Lock()) + validator.validate_fastq_files_in_path(filepaths, Lock()) if __name__ == '__main__': diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index e6ca6d5..c29653f 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -12,21 +12,20 @@ def _log(message: str): class Engine(object): - def __call__(self, path): - for filename in path.glob('**/*.gz'): - excluded = r'.*/*fastq.gz' - if re.search(excluded, filename.as_posix()): - return - try: - _log(f'Threaded {filename}') - with gzip.open(filename) as g_f: - while True: - buf = g_f.read(1024*1024) - if not buf: - break - except Exception as e: - _log(f'{filename} is not a valid gzipped file {e}') - return f'{filename} is not a valid gzipped file' + def __call__(self, filename): + excluded = r".*/*fastq.gz" + if re.search(excluded, filename.as_posix()): + return + try: + _log(f"Threaded {filename}") + with gzip.open(filename) as g_f: + while True: + buf = g_f.read(1024 * 1024) + if not buf: + break + except Exception as e: + _log(f"{filename} is not a valid gzipped file {e}") + return f"{filename} is not a valid gzipped file" class GZValidator(Validator): @@ -35,16 +34,18 @@ class GZValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') - try: - pool = Pool(threads) - engine = Engine() - data_output = pool.imap_unordered(engine, self.paths) - except Exception as e: - _log(f'Error {e}') - else: - pool.close() - 
pool.join() - [data_output2.append(output) for output in data_output if output] + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") + for path in self.paths: + for glob_expr in ["**/*.gz"]: + try: + pool = Pool(threads) + engine = Engine() + data_output = pool.imap_unordered(engine, path.glob(glob_expr)) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_output2.append(output) for output in data_output if output] return data_output2 diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py index 256584c..23cd5e7 100644 --- a/tests/test_codex_common_errors_validator.py +++ b/tests/test_codex_common_errors_validator.py @@ -40,7 +40,7 @@ def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_pat test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexCommonErrorsValidator(tmp_path / test_data_path.stem, + validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], 'CODEX' ) errors = validator.collect_errors()[:] diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 967c2c4..55dadef 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -39,7 +39,7 @@ def fastq_validator(self) -> FASTQValidatorLogic: return FASTQValidatorLogic() def test_fastq_validator_no_files(self, fastq_validator, tmp_path): - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # This case should return no errors assert fastq_validator.errors == [] @@ -65,7 +65,7 @@ def test_fastq_validator_unrecognized_file(self, fastq_validator, def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path): - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # No files in path means no errors assert fastq_validator.errors == [] @@ -76,7 +76,7 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): with _open_output_file(test_file, use_gzip) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert not fastq_validator.errors def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): @@ -84,7 +84,7 @@ def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): with _open_output_file(test_file, False) as output: output.write('ABCDEF') - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # This test does not assert specific validations but instead that the # overall file failed and that error messages were returned. 
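# For context, _open_output_file is the test module's fixture-writing helper,
# defined above these hunks and outside this diff; a minimal sketch of what
# it presumably does, hedged since the real definition may differ:

import gzip
from pathlib import Path

def _open_output_file(path: Path, use_gzip: bool):
    # Return a text-mode handle for writing FASTQ fixture data,
    # gzip-compressed when use_gzip is True.
    return gzip.open(path, "wt") if use_gzip else path.open("w")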
@@ -98,7 +98,7 @@ def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert "test.fastq has been found multiple times" in \ fastq_validator.errors[0] @@ -192,7 +192,7 @@ def test_fastq_validator_record_counts_good(self, fastq_validator, with _open_output_file(new_file, False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert not fastq_validator.errors @@ -208,7 +208,7 @@ def test_fastq_validator_record_counts_bad(self, fastq_validator, output.write(_GOOD_RECORDS) output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # Order of the files being processed is not guaranteed, however these # strings ensure that a mismatch was found. From 7d9d55ad5ee6f5a4f41149d0f464f90007b5e3d8 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 24 Jan 2024 16:36:37 -0500 Subject: [PATCH 09/30] small linting change --- src/ingest_validation_tests/ome_tiff_validator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index 2979035..e4df25a 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -1,5 +1,4 @@ from typing import List -from pathlib import Path import tifffile import xmlschema From f923db4ac9d89672819dbc3ce47e9886100f5cc7 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:09:11 -0500 Subject: [PATCH 10/30] testing different way of passing files for concurrent processing --- .../fastq_validator_logic.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 9e212b3..37791cf 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -27,15 +27,14 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, path: Path, lock): + def __init__(self, validate_object, lock): self.validate_object = validate_object - self.path = path self.lock = lock def __call__(self, fastq_file): errors = [] - _log(f"Validating matching fastq files in {self.path}") - self.validate_object.validate_fastq_file(self.path / fastq_file, self.lock) + _log(f"Validating matching fastq file {fastq_file}") + self.validate_object.validate_fastq_file(fastq_file, self.lock) for err in self.validate_object.errors: errors.append(err) return errors @@ -176,6 +175,7 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: return # TODO: this locates duplicate filenames across dirs, is that right? # Difficult to log instances because first is not captured. 
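# (Why more concurrency makes this check unreliable, sketched against the
# shared Manager().dict() used here: two pool workers can interleave as
#     worker A: "x.fastq" in counts  -> False
#     worker B: "x.fastq" in counts  -> False
#     worker A: counts["x.fastq"] = n_a
#     worker B: counts["x.fastq"] = n_b
# so neither reports the duplicate; the membership test and the assignment
# are not atomic without a lock.)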
+ # ALSO this doesn't work reliably when more threads are running concurrently if fastq_file.name in self._file_record_counts.keys(): self.errors.append(_log( f"{fastq_file.name} has been found multiple times during this " @@ -213,22 +213,24 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: for path in paths: dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") + file_list = [] with Manager() as manager: lock = manager.Lock() for path, rel_paths in dirs_and_files.items(): - for rel_path, file_list in rel_paths.items(): - fullpath = Path(path / rel_path) - try: - logging.info(f"Passing paths {fullpath} to engine.") - pool = Pool(threads) - engine = Engine(self, fullpath, lock) - data_output = pool.imap_unordered(engine, file_list) - except Exception as e: - _log(f"Error {e}") - else: - pool.close() - pool.join() - [data_found_one.extend(output) for output in data_output if output] + for rel_path, files in rel_paths.items(): + for file in files: + file_list.append(Path(path / rel_path / file)) + try: + logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.") + pool = Pool(threads) + engine = Engine(self, lock) + data_output = pool.imap_unordered(engine, file_list) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_found_one.extend(output) for output in data_output if output] if len(data_found_one) > 0: self.errors = data_found_one From 5e7bd34fe4067c5f3aadf0dc4ce43934f30968bd Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:19:51 -0500 Subject: [PATCH 11/30] replicating changes to ome_tiff/tiff checking --- .../ome_tiff_validator.py | 38 +++++++++++++------ src/ingest_validation_tests/tiff_validator.py | 4 +- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index e4df25a..e7e758d 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -1,3 +1,5 @@ +from multiprocessing import Pool +from os import cpu_count from typing import List import tifffile @@ -5,21 +7,35 @@ from ingest_validation_tools.plugin_validator import Validator +def _check_ome_tiff_file(file: str): + try: + with tifffile.TiffFile(file) as tf: + xml_document = xmlschema.XmlDocument(tf.ome_metadata) + if xml_document.schema and not xml_document.schema.is_valid(xml_document): + return f"{file} is not a valid OME.TIFF file" + except Exception as excp: + return f"{file} is not a valid OME.TIFF file: {excp}" + + class OmeTiffValidator(Validator): description = "Recursively test all ome-tiff files for validity" cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - del kwargs - rslt = [] - for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + pool = Pool(threads) + filenames_to_test = [] + for glob_expr in [ + "**/*.ome.tif", + "**/*.ome.tiff", + "**/*.OME.TIFF", + "**/*.OME.TIF", + ]: for path in self.paths: for file in path.glob(glob_expr): - try: - with tifffile.TiffFile(file) as tf: - xml_document = xmlschema.XmlDocument(tf.ome_metadata) - if not xml_document.schema.is_valid(xml_document): - rslt.append(f'{file} is not a valid OME.TIFF file') - except Exception as excp: - rslt.append(f'{file} is not a valid OME.TIFF file: {excp}') - 
return rslt + filenames_to_test.append(file) + return list( + rslt + for rslt in pool.imap_unordered(_check_ome_tiff_file, filenames_to_test) + if rslt is not None + ) diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 57aba4d..8615646 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -1,6 +1,6 @@ from multiprocessing import Pool from os import cpu_count -from typing import List +from typing import List, Optional import tifffile from ingest_validation_tools.plugin_validator import Validator @@ -21,7 +21,7 @@ def _log(message: str): print(message) -def _check_tiff_file(path: str) -> str or None: +def _check_tiff_file(path: str) -> Optional[str]: try: with tifffile.TiffFile(path) as tfile: for page in tfile.pages: From dad0b7d679ea797b7014e32f3234405f3a8413a4 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:26:49 -0500 Subject: [PATCH 12/30] updated gz_validator --- src/ingest_validation_tests/gz_validator.py | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index c29653f..58ad671 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -36,16 +36,18 @@ def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 _log(f"Threading at {threads}") + file_list = [] for path in self.paths: for glob_expr in ["**/*.gz"]: - try: - pool = Pool(threads) - engine = Engine() - data_output = pool.imap_unordered(engine, path.glob(glob_expr)) - except Exception as e: - _log(f"Error {e}") - else: - pool.close() - pool.join() - [data_output2.append(output) for output in data_output if output] + file_list.extend(path.glob(glob_expr)) + try: + pool = Pool(threads) + engine = Engine() + data_output = pool.imap_unordered(engine, file_list) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_output2.append(output) for output in data_output if output] return data_output2 From 5b2f835be9bcd42163702f315ea156f05fc81482 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:41:27 -0500 Subject: [PATCH 13/30] cleaning up TODOs --- .../codex_common_errors_validator.py | 2 -- src/ingest_validation_tests/fastq_validator_logic.py | 7 +++++-- src/ingest_validation_tests/tiff_validator.py | 1 - tests/test_fastq_validator_logic.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 8954b05..391e9cc 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -58,7 +58,6 @@ def collect_errors(self, **kwargs) -> List[str]: rslts = [] for path in self.paths: rslt = [] - # TODO: this seems to be replicating dir validation? Is this necessary for legacy support? try: # is the raw/src_ directory present? 
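# (Accepted layouts, with hypothetical names: raw data lives either under
#     dataset/raw/...        e.g. dataset/raw/channelnames.txt
# or under a src_* sibling
#     dataset/src_001/...    e.g. dataset/src_001/channelnames.txt
# and the code below resolves `prefix` to whichever one exists.)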
prefix = None @@ -114,7 +113,6 @@ def collect_errors(self, **kwargs) -> List[str]: if not channelnames_txt_path.is_file(): rslt.append('channelnames.txt is missing') raise QuitNowException() - # TODO: end questionable portion # Parse channelnames.txt into a dataframe try: diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 37791cf..ad52ee8 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -168,10 +168,13 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: try: with _open_fastq_file(fastq_file) as fastq_data: records_read = self.validate_fastq_stream(fastq_data) - # TODO: Add gzip.BadGzipFile when Python 3.8 is available + except gzip.BadGzipFile: + self.errors.append( + self._format_error(f"Bad gzip file: {fastq_file}.")) + return except IOError: self.errors.append( - self._format_error("Unable to open FASTQ data file.")) + self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return # TODO: this locates duplicate filenames across dirs, is that right? # Difficult to log instances because first is not captured. diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 8615646..db840c1 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -40,7 +40,6 @@ def collect_errors(self, **kwargs) -> List[str]: threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] - # TODO: this does not exclude OME.TIFF files, should it? for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: for path in self.paths: for file in path.glob(glob_expr): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 55dadef..797abc5 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -51,7 +51,7 @@ def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file, Lock()) - assert "Unable to open" in fastq_validator.errors[0] + assert "Bad gzip file" in fastq_validator.errors[0] def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path): From 1c39a39e187c689f7d09b8a82002d3cdaa85d611 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 16:55:54 -0500 Subject: [PATCH 14/30] fixing fastq per-dataset validation --- .../fastq_validator_logic.py | 100 ++++++++++-------- tests/test_fastq_validator_logic.py | 8 +- 2 files changed, 58 insertions(+), 50 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index ad52ee8..6784a8b 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -27,14 +27,13 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, lock): + def __init__(self, validate_object): self.validate_object = validate_object - self.lock = lock def __call__(self, fastq_file): errors = [] _log(f"Validating matching fastq file {fastq_file}") - self.validate_object.validate_fastq_file(fastq_file, self.lock) + self.validate_object.validate_fastq_file(fastq_file) for err in self.validate_object.errors: errors.append(err) return errors @@ -55,19 +54,14 @@ class FASTQValidatorLogic: contain the same 
number of symbols as letters in the sequence. """ - # This pattern seeks out the string that includes the lane number (since - # that is expected to be present to help anchor the prefix) that comes - # before any of _I1, _I2, _R1, or _R2. - _FASTQ_FILE_PREFIX_REGEX = re.compile(r'(.+_L\d+.*)_[IR][12][._]') - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' def __init__(self, verbose=False): self.errors: List[str] = [] - self._filename = '' - self._line_number = 0 self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() + self._filename = '' + self._line_number = 0 self._verbose = verbose @@ -151,7 +145,7 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: return line_count + 1 - def validate_fastq_file(self, fastq_file: Path, lock) -> None: + def validate_fastq_file(self, fastq_file: Path) -> None: _log(f"Validating {fastq_file.name}...") _log(f" → {fastq_file.absolute().as_posix()}") @@ -176,39 +170,7 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: self.errors.append( self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return - # TODO: this locates duplicate filenames across dirs, is that right? - # Difficult to log instances because first is not captured. - # ALSO this doesn't work reliably when more threads are running concurrently - if fastq_file.name in self._file_record_counts.keys(): - self.errors.append(_log( - f"{fastq_file.name} has been found multiple times during this " - "validation.")) - self._file_record_counts[fastq_file.name] = records_read - - match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) - with lock: - if match: - filename_prefix = match.group(1) - if filename_prefix in self._file_prefix_counts.keys(): - extant_count = self._file_prefix_counts[filename_prefix] - if extant_count != records_read: - # Find a file we've validated already that matches this - # prefix. - extant_files = [ - filename for filename, record_count - in self._file_record_counts.items() - if record_count == extant_count and filename.startswith(filename_prefix) - ] - # Based on how the dictionaries are created, there should - # always be at least one matching filename. - assert extant_files - - self.errors.append(_log( - f"{fastq_file.name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines).")) - else: - self._file_prefix_counts[filename_prefix] = records_read + self._file_record_counts[str(fastq_file)] = records_read def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: data_found_one = [] @@ -226,17 +188,63 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: try: logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.") pool = Pool(threads) - engine = Engine(self, lock) + engine = Engine(self) data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f"Error {e}") else: pool.close() pool.join() + self._find_duplicates(dirs_and_files) + self._find_shared_prefixes(lock) [data_found_one.extend(output) for output in data_output if output] if len(data_found_one) > 0: - self.errors = data_found_one + self.errors.extend(data_found_one) + + def _find_duplicates(self, dirs_and_files): + for data_path, sub_dirs in dirs_and_files.items(): + # Creates a dict of filenames to list of full filepaths for each + # fastq file in a given data_path (dataset dir). 
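# e.g., with hypothetical paths, the mapping built below looks like
#     files_per_path == {
#         "a_L001_R1.fastq": [Path("data/raw"), Path("data/processed")],
#         "b_L001_R1.fastq": [Path("data/raw")],
#     }
# and only "a_L001_R1.fastq", found in two locations, is flagged.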
+ files_per_path = defaultdict(list) + for sub_path, filepaths in sub_dirs.items(): + for filepath in filepaths: + files_per_path[filepath.name].append(data_path / sub_path) + for filename, filepaths in files_per_path.items(): + if len(filepaths) > 1: + self.errors.append(_log( + f"{filename} has been found multiple times during this validation of path {data_path}. Duplicates: {filepaths}.")) # noqa: E501 + + def _find_shared_prefixes(self, lock): + # This pattern seeks out the string that includes the lane number (since + # that is expected to be present to help anchor the prefix) that comes + # before any of _I1, _I2, _R1, or _R2. + fastq_file_prefix_regex = re.compile(r'(.+_L\d+.*)_[IR][12][._]') + for fastq_file, records_read in self._file_record_counts.items(): + match = fastq_file_prefix_regex.match(Path(fastq_file).name) + with lock: + if match: + filename_prefix = match.group(1) + if filename_prefix in self._file_prefix_counts.keys(): + extant_count = self._file_prefix_counts[filename_prefix] + if extant_count != records_read: + # Find a file we've validated already that matches this + # prefix. + extant_files = [ + str(Path(filepath).name) for filepath, record_count + in self._file_record_counts.items() + if record_count == extant_count and Path(filepath).name.startswith(filename_prefix) + ] + # Based on how the dictionaries are created, there should + # always be at least one matching filename. + assert extant_files + + self.errors.append(_log( + f"{Path(fastq_file).name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines).")) + else: + self._file_prefix_counts[filename_prefix] = records_read def main(): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 797abc5..a244cd8 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -50,7 +50,7 @@ def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_file(test_file, Lock()) + fastq_validator.validate_fastq_file(test_file) assert "Bad gzip file" in fastq_validator.errors[0] def test_fastq_validator_unrecognized_file(self, fastq_validator, @@ -59,7 +59,7 @@ def test_fastq_validator_unrecognized_file(self, fastq_validator, with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_file(test_file, Lock()) + fastq_validator.validate_fastq_file(test_file) assert "Filename does not have proper format" in \ fastq_validator.errors[0] @@ -105,7 +105,7 @@ def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): def test_fastq_validator_io_error(self, fastq_validator, tmp_path): fake_path = tmp_path.joinpath('does-not-exist.fastq') - fastq_validator.validate_fastq_file(fake_path, Lock()) + fastq_validator.validate_fastq_file(fake_path) assert "Unable to open" in fastq_validator.errors[0] @@ -178,7 +178,7 @@ def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, with _open_output_file(new_file, False) as output: output.write(test_data) - fastq_validator.validate_fastq_file(new_file, Lock()) + fastq_validator.validate_fastq_file(new_file) assert "contains 9 characters which does not match line 2's 10" in \ fastq_validator.errors[0] From fb7d110a4e6069d8b8c629a25c6fde43af7f56a3 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 29 Jan 2024 16:31:38 -0500 Subject: [PATCH 15/30] slightly clearer error 
messaging --- src/ingest_validation_tests/fastq_validator_logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 6784a8b..16403e4 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -213,7 +213,7 @@ def _find_duplicates(self, dirs_and_files): for filename, filepaths in files_per_path.items(): if len(filepaths) > 1: self.errors.append(_log( - f"{filename} has been found multiple times during this validation of path {data_path}. Duplicates: {filepaths}.")) # noqa: E501 + f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}.")) # noqa: E501 def _find_shared_prefixes(self, lock): # This pattern seeks out the string that includes the lane number (since From ae28b42a8b8378ff87fc37b0b020be03c06aa295 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 30 Jan 2024 12:55:20 -0500 Subject: [PATCH 16/30] updating .travis.yml with newest IVT release --- .travis.yml | 8 ++++---- src/ingest_validation_tests/fastq_validator_logic.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index adf9a70..beeedc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,20 @@ language: python python: - - '3.9' + - "3.9" cache: pip install: - pip install -r requirements.txt # - pip install -r requirements-dev.txt - + - pwd - cd .. - git clone https://github.com/hubmapconsortium/ingest-validation-tools.git - cd ingest-validation-tools - pwd - - git checkout v0.0.15 + - git checkout v0.0.17 # - pip install "setuptools==60.9.0" - pip install -r requirements.txt - + - cd ../ingest-validation-tests # Not sure if this is required, or if it resets between sections. script: - ./test.sh diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 16403e4..85eeb44 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -180,6 +180,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") file_list = [] with Manager() as manager: + # TODO: re-evaluate dicts/for loops lock = manager.Lock() for path, rel_paths in dirs_and_files.items(): for rel_path, files in rel_paths.items(): @@ -203,6 +204,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: self.errors.extend(data_found_one) def _find_duplicates(self, dirs_and_files): + # TODO: re-evaluate dicts/for loops for data_path, sub_dirs in dirs_and_files.items(): # Creates a dict of filenames to list of full filepaths for each # fastq file in a given data_path (dataset dir). 
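Both "re-evaluate dicts/for loops" TODOs above concern the nested mapping
built by fastq_utils.collect_fastq_files_by_directory. A hedged sketch of
the shape being iterated, with hypothetical paths (the exact return type
belongs to the fastq-utils library and may differ):

    dirs_and_files = {
        Path("dataset1"): {                 # one entry per dataset path
            Path("raw"): [Path("a_L001_R1.fastq.gz"), Path("a_L001_R2.fastq.gz")],
            Path("processed"): [Path("a_L001_R1.fastq.gz")],
        },
    }
    # flattened into file_list as dataset1/raw/a_L001_R1.fastq.gz, etc.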
From f69bf7db12ead2fba59ce200b85c92dbfba4530d Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 14:22:10 -0500 Subject: [PATCH 17/30] adding linting and formatting tools --- .flake8 | 3 +++ pyproject.toml | 7 +++++++ requirements-dev.txt | 10 ++++++++++ 3 files changed, 20 insertions(+) create mode 100644 .flake8 create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..ad98539 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 100 +extend-ignore = E203 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..72dd8d9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[tool.black] +line-length = 99 + +[tool.isort] +profile = "black" +multi_line_output = 3 +src_paths = ["src/ingest-validation-tests"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..d5c087a --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,10 @@ +black==23.12.1 +flake8==7.0.0 +git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +imagecodecs>=2023.3.16 +isort==5.13.2 +jsonschema==4.4.0 +pandas>=1.2.0 +python-frontmatter>=1.0.0 +tifffile==2020.10.1 +xmlschema>=1.6 From 930fd015a671cdadd51520c8024d39ab8b4fe71d Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 11:35:10 -0500 Subject: [PATCH 18/30] ran black / flake8 / isort, fixed errors, updated .gitignore --- .gitignore | 1 + .../codex_common_errors_validator.py | 104 +++++++--------- .../codex_json_validator.py | 8 +- .../fastq_validator.py | 6 +- .../fastq_validator_logic.py | 107 ++++++++-------- src/ingest_validation_tests/gz_validator.py | 4 +- .../publication_validator.py | 63 ++++++---- src/ingest_validation_tests/tiff_validator.py | 12 +- tests/pytest_runner.py | 18 +-- tests/test_codex_common_errors_validator.py | 107 ++++++++++------ tests/test_codex_json_validator.py | 25 ++-- tests/test_fastq_validator_logic.py | 116 ++++++++---------- tests/test_gz_validator.py | 19 +-- tests/test_ome_tiff_validator.py | 23 ++-- tests/test_publication_validator.py | 56 +++++---- tests/test_tiff_validator.py | 32 +++-- 16 files changed, 381 insertions(+), 320 deletions(-) diff --git a/.gitignore b/.gitignore index 916db7b..da9fde3 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,7 @@ celerybeat.pid # Environments .env +.envrc .venv env/ venv/ diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 391e9cc..5edbec8 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -20,22 +20,20 @@ def _split_cycle_dir_string(cycle_str): """ Given a cycle-and-region directory name, split out the cyle and region numbers """ - words = cycle_str.split('_') + words = cycle_str.split("_") assert len(words) >= 2, f'Directory string "{cycle_str}" has unexpected form' - assert words[0].startswith('cyc'), (f'directory string "{cycle_str}" does' - ' not start with "cyc"') + assert words[0].startswith("cyc"), ( + f'directory string "{cycle_str}" does' ' not start with "cyc"' + ) try: - cyc_id = int(words[0][len('cyc'):]) + cyc_id = int(words[0][len("cyc") :]) except ValueError: - raise AssertionError(f'Directory string "{cycle_str}" cycle number is' - ' not an integer') - assert words[1].startswith('reg'), (f'Directory string "{cycle_str}" does' - ' not include "_reg"') + raise 
AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer") + assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"' try: - reg_id = int(words[1][len('reg'):]) + reg_id = int(words[1][len("reg") :]) except ValueError: - raise AssertionError(f'Directory string "{cycle_str}" region number is' - ' not an integer') + raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer") return cyc_id, reg_id @@ -53,7 +51,7 @@ def collect_errors(self, **kwargs) -> List[str]: Return the errors found by this validator """ del kwargs - if self.assay_type != 'CODEX': + if self.assay_type != "CODEX": return [] # We only test CODEX data rslts = [] for path in self.paths: @@ -61,101 +59,95 @@ def collect_errors(self, **kwargs) -> List[str]: try: # is the raw/src_ directory present? prefix = None - if (path / 'raw').is_dir(): - prefix = path / 'raw' + if (path / "raw").is_dir(): + prefix = path / "raw" else: - for candidate in path.glob('src_*'): + for candidate in path.glob("src_*"): prefix = candidate if prefix is None: - rslt.append('The raw/src_ subdirectory is missing?') + rslt.append("The raw/src_ subdirectory is missing?") raise QuitNowException() # Does dataset.json exist? If so, 'new CODEX' syntax rules # are in effect dataset_json_exists = False any_dataset_json_exists = False - for candidate in path.glob('**/dataset.json'): + for candidate in path.glob("**/dataset.json"): any_dataset_json_exists = True - if candidate == prefix / 'dataset.json': + if candidate == prefix / "dataset.json": dataset_json_exists = True if dataset_json_exists: - print('FOUND dataset.json; skipping further analysis') + print("FOUND dataset.json; skipping further analysis") raise QuitNowException() elif any_dataset_json_exists: - rslt.append( - 'A dataset.json file exists but' - ' is in the wrong place' - ) + rslt.append("A dataset.json file exists but" " is in the wrong place") # is the segmentation.json file on the right side? found = False right_place = False - for filepath in path.glob('*/[Ss]egmentation.json'): + for filepath in path.glob("*/[Ss]egmentation.json"): rel_path = filepath.relative_to(path) found = True - if str(rel_path).startswith(('raw', 'src_')): + if str(rel_path).startswith(("raw", "src_")): right_place = True if found: if right_place: pass else: - rslt.append( - 'The segmentation.json file is in the wrong subdirectory' - ) + rslt.append("The segmentation.json file is in the wrong subdirectory") else: - rslt.append('The segmentation.json file is missing or misplaced') + rslt.append("The segmentation.json file is missing or misplaced") # Does the channelnames.txt file exist? 
- channelnames_txt_path = prefix / 'channelnames.txt' + channelnames_txt_path = prefix / "channelnames.txt" if not channelnames_txt_path.is_file(): # sometimes we see this variant - channelnames_txt_path = prefix / 'channelNames.txt' + channelnames_txt_path = prefix / "channelNames.txt" if not channelnames_txt_path.is_file(): - rslt.append('channelnames.txt is missing') + rslt.append("channelnames.txt is missing") raise QuitNowException() # Parse channelnames.txt into a dataframe try: cn_df = pd.read_csv(str(channelnames_txt_path), header=None) except Exception: - rslt.append(f'Unexpected error reading {channelnames_txt_path}') + rslt.append(f"Unexpected error reading {channelnames_txt_path}") raise QuitNowException() if len(cn_df.columns) != 1: - rslt.append(f'Unexpected format for {channelnames_txt_path}') + rslt.append(f"Unexpected format for {channelnames_txt_path}") raise QuitNowException() # Does the channelnames_report.csv file exist? - report_csv_path = prefix / 'channelnames_report.csv' + report_csv_path = prefix / "channelnames_report.csv" if report_csv_path.is_file(): # Parse channelnames_report.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) + rpt_df = pd.read_csv(str(report_csv_path), sep=",", header=None) except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f"Unexpected error reading {report_csv_path}") raise QuitNowException() if len(rpt_df) == len(cn_df) + 1: # channelnames_report.csv appears to have a header try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',') + rpt_df = pd.read_csv(str(report_csv_path), sep=",") except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f"Unexpected error reading {report_csv_path}") raise QuitNowException() if len(rpt_df.columns) != 2: rslt.append( - f'Could not parse {report_csv_path}.' - ' Is it a comma-separated table?' + f"Could not parse {report_csv_path}." " Is it a comma-separated table?" ) raise QuitNowException() col_0, col_1 = rpt_df.columns - rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'}) + rpt_df = rpt_df.rename(columns={col_0: "Marker", col_1: "Result"}) # Do they match? 
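# (A toy illustration of the row-wise comparison below, data hypothetical:
#     cn_df  = pd.DataFrame({0: ["DAPI", "CD4"]})
#     rpt_df = pd.DataFrame({"Marker": ["DAPI", "CD8"], "Result": ["ok", "ok"]})
#     rpt_df["other"] = cn_df[0]                   # aligns on the row index
#     rpt_df[rpt_df["other"] != rpt_df["Marker"]]  # -> row 1: CD8 vs CD4
# which is exactly the line-by-line mismatch reported below.)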
- rpt_df['other'] = cn_df[0] - mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] + rpt_df["other"] = cn_df[0] + mismatches_df = rpt_df[rpt_df["other"] != rpt_df["Marker"]] if len(mismatches_df) != 0: for idx, row in mismatches_df.iterrows(): rslt.append( - f'{channelnames_txt_path.name} does not' - ' match channelnames_report.txt' + f"{channelnames_txt_path.name} does not" + " match channelnames_report.txt" f' on line {idx}: {row["other"]} vs {row["Marker"]}' ) raise QuitNowException() @@ -164,7 +156,7 @@ def collect_errors(self, **kwargs) -> List[str]: # Tabulate the cycle and region info all_cycle_dirs = [] - for glob_str in ['cyc*', 'Cyc*']: + for glob_str in ["cyc*", "Cyc*"]: for pth in prefix.glob(glob_str): if pth.is_dir(): all_cycle_dirs.append(str(pth.stem).lower()) @@ -189,28 +181,26 @@ def collect_errors(self, **kwargs) -> List[str]: failures = [] # First cycle must be 1 if cycles[0] != 1: - failures.append('Cycle numbering does not start at 1') + failures.append("Cycle numbering does not start at 1") # First region must be 1 if regions[0] != 1: - failures.append('Region numbering does not start at 1') + failures.append("Region numbering does not start at 1") # Cycle range must be contiguous ints if cycles != list(range(cycles[0], cycles[-1] + 1)): - failures.append('Cycle numbers are not contiguous') + failures.append("Cycle numbers are not contiguous") # Region range must be contiguous ints if regions != list(range(regions[0], regions[-1] + 1)): - failures.append('Region numbers are not contiguous') + failures.append("Region numbers are not contiguous") # All cycle, region pairs must be present if len(cycles) * len(regions) != total_entries: - failures.append('Not all cycle/region pairs are present') + failures.append("Not all cycle/region pairs are present") # Total number of channels / total number of cycles must be integer, # excluding any HandE channels total_channel_count = len(cn_df) - h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) - channels_per_cycle = ( - total_channel_count - h_and_e_channel_count - ) / len(cycles) + h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")]) + channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles) if channels_per_cycle != int(channels_per_cycle): - failures.append('The number of channels per cycle is not constant') + failures.append("The number of channels per cycle is not constant") if failures: rslt += failures raise QuitNowException() diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 1e9e9ba..41a6955 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -12,19 +12,19 @@ class CodexJsonValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: del kwargs - if 'codex' not in self.assay_type.lower(): + if "codex" not in self.assay_type.lower(): return [] - schema_path = Path(__file__).parent / 'codex_schema.json' + schema_path = Path(__file__).parent / "codex_schema.json" schema = json.loads(schema_path.read_text()) rslt = [] - for glob_expr in ['**/dataset.json']: + for glob_expr in ["**/dataset.json"]: for path in self.paths: for file in path.glob(glob_expr): instance = json.loads(file.read_text()) try: validate(instance=instance, schema=schema) except Exception as e: - rslt.append(f'{file}: {e}') + rslt.append(f"{file}: {e}") return rslt diff --git a/src/ingest_validation_tests/fastq_validator.py 
b/src/ingest_validation_tests/fastq_validator.py index 0a30e42..d31335f 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from ingest_validation_tools.plugin_validator import Validator from fastq_validator_logic import FASTQValidatorLogic, _log +from ingest_validation_tools.plugin_validator import Validator class FASTQValidator(Validator): @@ -10,8 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 85eeb44..8095b74 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -15,10 +15,7 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return ( - gzip.open(file, 'rt') if file.name.endswith('.gz') - else file.open() - ) + return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() def _log(message: str) -> str: @@ -54,13 +51,13 @@ class FASTQValidatorLogic: contain the same number of symbols as letters in the sequence. """ - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' + _FASTQ_LINE_2_VALID_CHARS = "ACGNT" def __init__(self, verbose=False): self.errors: List[str] = [] self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() - self._filename = '' + self._filename = "" self._line_number = 0 self._verbose = verbose @@ -80,7 +77,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != '@': + if not line or line[0] != "@": return ["Line does not begin with '@'."] return [] @@ -89,45 +86,47 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = ''.join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) + invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != '+': + if not line or line[0] != "+": return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append("Line contains invalid quality character(s): " - f'"{invalid_chars}"') + errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"') if len(line) != self._line_2_length: - errors.append(f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters.") + errors.append( + f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters." 
+ ) return errors - _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4} + _VALIDATE_FASTQ_LINE_METHODS = { + 1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4, + } def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ - self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[ + [FASTQValidatorLogic, str], List[str] + ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert validator_method, \ - f"No validator method defined for record index {line_index}" + assert validator_method, f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -139,8 +138,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) for error in - self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) + for error in self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -151,9 +150,9 @@ def validate_fastq_file(self, fastq_file: Path) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. - self.errors.append(_log( - "Filename does not have proper format " - "and will not be processed")) + self.errors.append( + _log("Filename does not have proper format " "and will not be processed") + ) return self._line_number = 0 @@ -163,12 +162,10 @@ def validate_fastq_file(self, fastq_file: Path) -> None: with _open_fastq_file(fastq_file) as fastq_data: records_read = self.validate_fastq_stream(fastq_data) except gzip.BadGzipFile: - self.errors.append( - self._format_error(f"Bad gzip file: {fastq_file}.")) + self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}.")) return except IOError: - self.errors.append( - self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) + self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return self._file_record_counts[str(fastq_file)] = records_read @@ -187,7 +184,9 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: for file in files: file_list.append(Path(path / rel_path / file)) try: - logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.") + logging.info( + f"Passing file list for paths {paths} to engine. File list: {file_list}." + ) pool = Pool(threads) engine = Engine(self) data_output = pool.imap_unordered(engine, file_list) @@ -214,14 +213,17 @@ def _find_duplicates(self, dirs_and_files): files_per_path[filepath.name].append(data_path / sub_path) for filename, filepaths in files_per_path.items(): if len(filepaths) > 1: - self.errors.append(_log( - f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}.")) # noqa: E501 + self.errors.append( + _log( + f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}." # noqa: E501 + ) + ) def _find_shared_prefixes(self, lock): # This pattern seeks out the string that includes the lane number (since # that is expected to be present to help anchor the prefix) that comes # before any of _I1, _I2, _R1, or _R2. 
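# (Illustrative matches for the pattern below, filenames hypothetical:
#     "sample1_L001_R1_001.fastq.gz" -> prefix "sample1_L001"
#     "sample1_L001_I2_001.fastq.gz" -> prefix "sample1_L001"  (same group)
#     "sample1_R1.fastq.gz"          -> no match: no _L<lane> anchor
# files sharing a prefix must then have equal record counts.)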
- fastq_file_prefix_regex = re.compile(r'(.+_L\d+.*)_[IR][12][._]') + fastq_file_prefix_regex = re.compile(r"(.+_L\d+.*)_[IR][12][._]") for fastq_file, records_read in self._file_record_counts.items(): match = fastq_file_prefix_regex.match(Path(fastq_file).name) with lock: @@ -233,26 +235,31 @@ def _find_shared_prefixes(self, lock): # Find a file we've validated already that matches this # prefix. extant_files = [ - str(Path(filepath).name) for filepath, record_count - in self._file_record_counts.items() - if record_count == extant_count and Path(filepath).name.startswith(filename_prefix) + str(Path(filepath).name) + for filepath, record_count in self._file_record_counts.items() + if record_count == extant_count + and Path(filepath).name.startswith(filename_prefix) ] # Based on how the dictionaries are created, there should # always be at least one matching filename. assert extant_files - self.errors.append(_log( - f"{Path(fastq_file).name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines).")) + self.errors.append( + _log( + f"{Path(fastq_file).name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines)." + ) + ) else: self._file_prefix_counts[filename_prefix] = records_read def main(): - parser = argparse.ArgumentParser(description='Validate FASTQ files.') - parser.add_argument('filepaths', type=Path, nargs='+', - help="Files to validate for FASTQ syntax") + parser = argparse.ArgumentParser(description="Validate FASTQ files.") + parser.add_argument( + "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" + ) args = parser.parse_args() if isinstance(args.filepaths, List): @@ -262,13 +269,11 @@ def main(): elif isinstance(args.filepaths, str): filepaths = [Path(args.filepaths)] else: - raise Exception( - f"Validator init received base_paths arg as type {type(args.filepaths)}" - ) + raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}") validator = FASTQValidatorLogic(True) validator.validate_fastq_files_in_path(filepaths, Lock()) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index 58ad671..9bb0162 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,9 +1,9 @@ +import gzip +import re from multiprocessing import Pool from os import cpu_count -import re from typing import List -import gzip from ingest_validation_tools.plugin_validator import Validator diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py index 4c7ccc1..528a2b5 100644 --- a/src/ingest_validation_tests/publication_validator.py +++ b/src/ingest_validation_tests/publication_validator.py @@ -2,10 +2,11 @@ Test for some common errors in the directory and file structure of publications. """ -from typing import List -import re import json +import re from pathlib import Path +from typing import List + import frontmatter from ingest_validation_tools.plugin_validator import Validator @@ -15,50 +16,57 @@ class PublicationValidator(Validator): Test for some common errors in the directory and file structure of publications. 
""" + description = "Test for common problems found in publications" cost = 1.0 - base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)' - url_re = r'[Uu][Rr][Ll]' + base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)" + url_re = r"[Uu][Rr][Ll]" def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ del kwargs - if self.assay_type != 'Publication': + if self.assay_type != "Publication": return [] # We only test Publication data rslt = [] for path in self.paths: try: - vignette_path = path / 'vignettes' - assert vignette_path.is_dir(), 'vignettes not found or not a directory' - for this_vignette_path in vignette_path.glob('*'): - assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" - " in vignettes") - this_vignette_all_paths = set(this_vignette_path.glob('*')) + vignette_path = path / "vignettes" + assert vignette_path.is_dir(), "vignettes not found or not a directory" + for this_vignette_path in vignette_path.glob("*"): + assert this_vignette_path.is_dir(), ( + f"Found the non-dir {this_vignette_path}" " in vignettes" + ) + this_vignette_all_paths = set(this_vignette_path.glob("*")) if not all(pth.is_file() for pth in this_vignette_all_paths): - raise AssertionError('Found a subdirectory in a vignette') + raise AssertionError("Found a subdirectory in a vignette") md_found = False vig_figures = [] - for md_path in this_vignette_path.glob('*.md'): + for md_path in this_vignette_path.glob("*.md"): if md_found: - raise AssertionError('A vignette has more than one markdown file') + raise AssertionError("A vignette has more than one markdown file") else: md_found = True vig_fm = frontmatter.loads(md_path.read_text()) - for key in ['name', 'figures']: - assert key in vig_fm.metadata, ('vignette markdown is incorrectly' - f' formatted or has no {key}') - for fig_dict in vig_fm.metadata['figures']: - assert 'file' in fig_dict, 'figure dict does not reference a file' - assert 'name' in fig_dict, 'figure dict does not provide a name' - vig_figures.append(fig_dict['file']) + for key in ["name", "figures"]: + assert key in vig_fm.metadata, ( + "vignette markdown is incorrectly" f" formatted or has no {key}" + ) + for fig_dict in vig_fm.metadata["figures"]: + assert "file" in fig_dict, "figure dict does not reference a file" + assert "name" in fig_dict, "figure dict does not provide a name" + vig_figures.append(fig_dict["file"]) this_vignette_all_paths.remove(md_path) for fname in vig_figures: - rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path)) + rslt.extend( + self.validate_vitessce_config(this_vignette_path / fname, path) + ) this_vignette_all_paths.remove(this_vignette_path / fname) - assert not this_vignette_all_paths, ('unexpected files in vignette:' - f' {list(str(elt) for elt in this_vignette_all_paths)}') + assert not this_vignette_all_paths, ( + "unexpected files in vignette:" + f" {list(str(elt) for elt in this_vignette_all_paths)}" + ) except AssertionError as excp: rslt.append(str(excp)) @@ -94,9 +102,10 @@ def validate_vitessce_config(self, json_path, path): match = re.match(self.base_url_re, val) if match: # it starts with {{ base_url }} extra_url = match.group(2) - data_path = path / 'data' / extra_url - assert data_path.exists(), ("expected data file" - f" {Path('data') / extra_url} is absent") + data_path = path / "data" / extra_url + assert data_path.exists(), ( + "expected data file" f" {Path('data') / extra_url} is absent" + ) except AssertionError as excp: rslt.append(str(excp)) diff --git 
a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index db840c1..23b568a 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -37,13 +37,15 @@ class TiffValidator(Validator): cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] - for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: + for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: for path in self.paths: for file in path.glob(glob_expr): filenames_to_test.append(file) - return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, - filenames_to_test) - if rslt is not None) + return list( + rslt + for rslt in pool.imap_unordered(_check_tiff_file, filenames_to_test) + if rslt is not None + ) diff --git a/tests/pytest_runner.py b/tests/pytest_runner.py index 0743dad..ac7e5a3 100644 --- a/tests/pytest_runner.py +++ b/tests/pytest_runner.py @@ -1,12 +1,15 @@ import sys from pathlib import Path + import pytest -class add_path(): + +class add_path: """ Add an element to sys.path using a context. Thanks to Eugene Yarmash https://stackoverflow.com/a/39855753 """ + def __init__(self, path): self.path = path @@ -22,16 +25,13 @@ def __exit__(self, exc_type, exc_value, traceback): def main(): if len(sys.argv) != 2: - sys.exit(f'usage: {sys.argv[0]} path-to-ingest-validation-tools') - tools_path = Path(sys.argv[1]).resolve() / 'src' - plugins_path = (Path(__file__).resolve().parent.parent - / 'src' - / 'ingest_validation_tests' - ) + sys.exit(f"usage: {sys.argv[0]} path-to-ingest-validation-tools") + tools_path = Path(sys.argv[1]).resolve() / "src" + plugins_path = Path(__file__).resolve().parent.parent / "src" / "ingest_validation_tests" with add_path(str(tools_path)): with add_path(str(plugins_path)): - sys.exit(pytest.main(['-vv'])) + sys.exit(pytest.main(["-vv"])) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py index 23cd5e7..68d1d82 100644 --- a/tests/test_codex_common_errors_validator.py +++ b/tests/test_codex_common_errors_validator.py @@ -1,53 +1,84 @@ -from pathlib import Path import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_starts_list'), ( - ('test_data/fake_codex_tree_0.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_1.zip', ['The segmentation.json file is in', - 'Unexpected error reading']), - ('test_data/fake_codex_tree_2.zip', ['The raw/src_ subdirectory is missing?']), - ('test_data/fake_codex_tree_3.zip', ['channelnames.txt is missing']), - ('test_data/fake_codex_tree_4.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_5.zip', ['channelnames.txt does not match channelnames_report.txt' - ' on line 1: HLADR vs HLA-DR', - 'channelnames.txt does not match channelnames_report.txt' - ' on line 6: Empty vs Blank']), - ('test_data/fake_codex_tree_6.zip', ['Could not parse ']), - ('test_data/fake_codex_tree_7.zip', []), - ('test_data/fake_codex_tree_8.zip', ['Region numbers are not contiguous']), - ('test_data/fake_codex_tree_9.zip', ['Cycle numbers are not contiguous', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_10.zip', ['Directory string "cyc0a3_reg001_211119_040351"' - 
' cycle number is not an integer']), - ('test_data/fake_codex_tree_11.zip', ['Directory string "cyc003_reg0a1_211119_040351"' - ' region number is not an integer']), - ('test_data/fake_codex_tree_12.zip', ['Directory string "cyc002_rig001_211119_040351"' - ' does not include "_reg"']), - ('test_data/fake_codex_tree_13.zip', ['Cycle numbering does not start at 1']), - ('test_data/fake_codex_tree_14.zip', ['Region numbering does not start at 1']), - ('test_data/fake_codex_tree_15.zip', ['Not all cycle/region pairs are present', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_16.zip', []), - ('test_data/fake_codex_tree_17.zip', ['A dataset.json file exists but is in the wrong place', - 'Region numbering does not start at 1']), - ('test_data/fake_codex_tree_18.zip', ['The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_19.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_starts_list"), + ( + ("test_data/fake_codex_tree_0.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_1.zip", + ["The segmentation.json file is in", "Unexpected error reading"], + ), + ("test_data/fake_codex_tree_2.zip", ["The raw/src_ subdirectory is missing?"]), + ("test_data/fake_codex_tree_3.zip", ["channelnames.txt is missing"]), + ("test_data/fake_codex_tree_4.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_5.zip", + [ + "channelnames.txt does not match channelnames_report.txt" + " on line 1: HLADR vs HLA-DR", + "channelnames.txt does not match channelnames_report.txt" + " on line 6: Empty vs Blank", + ], + ), + ("test_data/fake_codex_tree_6.zip", ["Could not parse "]), + ("test_data/fake_codex_tree_7.zip", []), + ("test_data/fake_codex_tree_8.zip", ["Region numbers are not contiguous"]), + ( + "test_data/fake_codex_tree_9.zip", + [ + "Cycle numbers are not contiguous", + "The number of channels per cycle is not constant", + ], + ), + ( + "test_data/fake_codex_tree_10.zip", + ['Directory string "cyc0a3_reg001_211119_040351"' " cycle number is not an integer"], + ), + ( + "test_data/fake_codex_tree_11.zip", + ['Directory string "cyc003_reg0a1_211119_040351"' " region number is not an integer"], + ), + ( + "test_data/fake_codex_tree_12.zip", + ['Directory string "cyc002_rig001_211119_040351"' ' does not include "_reg"'], + ), + ("test_data/fake_codex_tree_13.zip", ["Cycle numbering does not start at 1"]), + ("test_data/fake_codex_tree_14.zip", ["Region numbering does not start at 1"]), + ( + "test_data/fake_codex_tree_15.zip", + [ + "Not all cycle/region pairs are present", + "The number of channels per cycle is not constant", + ], + ), + ("test_data/fake_codex_tree_16.zip", []), + ( + "test_data/fake_codex_tree_17.zip", + [ + "A dataset.json file exists but is in the wrong place", + "Region numbering does not start at 1", + ], + ), + ("test_data/fake_codex_tree_18.zip", ["The number of channels per cycle is not constant"]), + ("test_data/fake_codex_tree_19.zip", []), + ), +) def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_path): from codex_common_errors_validator import CodexCommonErrorsValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], - 'CODEX' - ) + validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], "CODEX") errors = validator.collect_errors()[:] - print(f'ERRORS FOLLOW FOR 
{test_data_fname}') + print(f"ERRORS FOLLOW FOR {test_data_fname}") for err in errors: print(err) - print('ERRORS ABOVE') + print("ERRORS ABOVE") assert len(msg_starts_list) == len(errors) for err_str, expected_str in zip(errors, msg_starts_list): assert err_str.startswith(expected_str) diff --git a/tests/test_codex_json_validator.py b/tests/test_codex_json_validator.py index a2998ff..4539270 100644 --- a/tests/test_codex_json_validator.py +++ b/tests/test_codex_json_validator.py @@ -1,22 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip', - [".*is not of type 'object'.*"]), - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip", + [".*is not of type 'object'.*"], + ), + ("test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip", []), + ), +) def test_codex_json_validator(test_data_fname, msg_re_list, tmp_path): from codex_json_validator import CodexJsonValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexJsonValidator(tmp_path / test_data_path.stem, - 'CODEX' - ) + validator = CodexJsonValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors()[:] assert len(msg_re_list) == len(errors) for err_str, expected_re in zip(errors, msg_re_list): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index a244cd8..c554d6f 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -1,36 +1,30 @@ -from multiprocessing import Lock +import gzip from pathlib import Path from typing import TextIO -import gzip import pytest +from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic -from src.ingest_validation_tests.fastq_validator_logic import \ - FASTQValidatorLogic - -_GOOD_RECORDS = '''\ +_GOOD_RECORDS = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFFF -''' +""" _GOOD_QUALITY_RECORD = ( - '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ' - r'[\]^_`abcdefghijklmnopqrstuvwxyz{|}~' + "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + r"[\]^_`abcdefghijklmnopqrstuvwxyz{|}~" ) _GOOD_SEQUENCE_FOR_QUALITY = ( - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" ) def _open_output_file(filename: Path, use_gzip: bool) -> TextIO: - return ( - gzip.open(filename, 'wt') if use_gzip - else open(filename, 'wt') - ) + return gzip.open(filename, "wt") if use_gzip else open(filename, "wt") class TestFASTQValidatorLogic: @@ -46,33 +40,29 @@ def test_fastq_validator_no_files(self, fastq_validator, tmp_path): def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): # Note that the filename ends in .gz, although it will not contain # compressed data. 
- test_file = tmp_path.joinpath('test.fastq.gz') + test_file = tmp_path.joinpath("test.fastq.gz") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) assert "Bad gzip file" in fastq_validator.errors[0] - def test_fastq_validator_unrecognized_file(self, fastq_validator, - tmp_path): - test_file = tmp_path.joinpath('test.txt') + def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path): + test_file = tmp_path.joinpath("test.txt") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) - assert "Filename does not have proper format" in \ - fastq_validator.errors[0] + assert "Filename does not have proper format" in fastq_validator.errors[0] - def test_fastq_validator_empty_directory(self, fastq_validator, - tmp_path): + def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path): fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # No files in path means no errors assert fastq_validator.errors == [] @pytest.mark.parametrize("use_gzip", [False, True]) def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): - test_file = tmp_path.joinpath('test.fastq.gz' if use_gzip - else 'test.fastq') + test_file = tmp_path.joinpath("test.fastq.gz" if use_gzip else "test.fastq") with _open_output_file(test_file, use_gzip) as output: output.write(_GOOD_RECORDS) @@ -80,9 +70,9 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): assert not fastq_validator.errors def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): - test_file = tmp_path.joinpath('test.fastq') + test_file = tmp_path.joinpath("test.fastq") with _open_output_file(test_file, False) as output: - output.write('ABCDEF') + output.write("ABCDEF") fastq_validator.validate_fastq_files_in_path([tmp_path], 2) @@ -91,56 +81,54 @@ def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): assert fastq_validator.errors def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): - for subdirectory in ['a', 'b']: + for subdirectory in ["a", "b"]: subdirectory_path = tmp_path.joinpath(subdirectory) subdirectory_path.mkdir() - with _open_output_file(subdirectory_path.joinpath('test.fastq'), - False) as output: + with _open_output_file(subdirectory_path.joinpath("test.fastq"), False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - assert "test.fastq has been found multiple times" in \ - fastq_validator.errors[0] + assert "test.fastq has been found multiple times" in fastq_validator.errors[0] def test_fastq_validator_io_error(self, fastq_validator, tmp_path): - fake_path = tmp_path.joinpath('does-not-exist.fastq') + fake_path = tmp_path.joinpath("does-not-exist.fastq") fastq_validator.validate_fastq_file(fake_path) assert "Unable to open" in fastq_validator.errors[0] def test_fastq_validator_line_1_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('@SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("@SEQ_ID", 0) assert not result def test_fastq_validator_line_1_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('*SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("*SEQ_ID", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_1_empty(self, fastq_validator): - result = fastq_validator.validate_fastq_record('', 0) + result = 
fastq_validator.validate_fastq_record("", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_2_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACTGACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACTGACTGNNNN", 1) assert not result def test_fastq_validator_line_2_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACT$ACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACT$ACTGNNNN", 1) assert "contains invalid character(s): $" in result[0] def test_fastq_validator_line_3_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('+SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("+SEQ_ID", 2) assert not result def test_fastq_validator_line_3_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('!SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("!SEQ_ID", 2) assert "does not begin with '+'" in result[0] @@ -151,42 +139,41 @@ def test_fastq_validator_line_4_good(self, fastq_validator): assert not result def test_fastq_validator_line_4_bad(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABC !@#', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABC !@#", 3) assert 'contains invalid quality character(s): " "' in result[0] def test_fastq_validator_line_4_matching_length(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABCDEFG', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABCDEFG", 3) assert not result - def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, - tmp_path): - fastq_validator.validate_fastq_record('123456789ABCDEF', 1) - fastq_validator.validate_fastq_record('ABC', 3) + def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, tmp_path): + fastq_validator.validate_fastq_record("123456789ABCDEF", 1) + fastq_validator.validate_fastq_record("ABC", 3) - test_data = '''\ + test_data = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFF -''' +""" - new_file = tmp_path.joinpath('test.fastq') + new_file = tmp_path.joinpath("test.fastq") with _open_output_file(new_file, False) as output: output.write(test_data) fastq_validator.validate_fastq_file(new_file) - assert "contains 9 characters which does not match line 2's 10" in \ - fastq_validator.errors[0] + assert ( + "contains 9 characters which does not match line 2's 10" in fastq_validator.errors[0] + ) - def test_fastq_validator_record_counts_good(self, fastq_validator, - tmp_path): + def test_fastq_validator_record_counts_good(self, fastq_validator, tmp_path): for filename in [ - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq', - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq' + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq", + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq", ]: new_file = tmp_path.joinpath(filename) with _open_output_file(new_file, False) as output: @@ -196,15 +183,14 @@ def test_fastq_validator_record_counts_good(self, fastq_validator, assert not fastq_validator.errors - def test_fastq_validator_record_counts_bad(self, fastq_validator, - tmp_path): - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq'), - 
False) as output: + def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path): + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq'), - False) as output: + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) output.write(_GOOD_RECORDS) diff --git a/tests/test_gz_validator.py b/tests/test_gz_validator.py index 0c6c8b2..ad748c4 100644 --- a/tests/test_gz_validator.py +++ b/tests/test_gz_validator.py @@ -1,19 +1,24 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/fake_snrnaseq_tree_good.zip', []), - ('test_data/fake_snrnaseq_tree_bad.zip', ['.*text2.txt.gz is not a valid gzipped file']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/fake_snrnaseq_tree_good.zip", []), + ("test_data/fake_snrnaseq_tree_bad.zip", [".*text2.txt.gz is not a valid gzipped file"]), + ), +) def test_gz_validator(test_data_fname, msg_re_list, tmp_path): from gz_validator import GZValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = GZValidator(tmp_path / test_data_path.stem, 'snRNAseq') + validator = GZValidator(tmp_path / test_data_path.stem, "snRNAseq") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_ome_tiff_validator.py b/tests/test_ome_tiff_validator.py index 09e2163..89ad198 100644 --- a/tests/test_ome_tiff_validator.py +++ b/tests/test_ome_tiff_validator.py @@ -1,20 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/codex_tree_ometiff_bad.zip', - ['.*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*']), - ('test_data/codex_tree_ometiff_good.zip',[]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/codex_tree_ometiff_bad.zip", + [".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*"], + ), + ("test_data/codex_tree_ometiff_good.zip", []), + ), +) def test_ome_tiff_validator(test_data_fname, msg_re_list, tmp_path): from ome_tiff_validator import OmeTiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = OmeTiffValidator(tmp_path / test_data_path.stem, 'CODEX') + validator = OmeTiffValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_publication_validator.py b/tests/test_publication_validator.py index ebb768c..50c3a0e 100644 --- a/tests/test_publication_validator.py +++ b/tests/test_publication_validator.py @@ -1,34 +1,46 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/publication_tree_good.zip', []), - ('test_data/publication_tree_good_complex.zip', []), - 
('test_data/publication_tree_bad_complex.zip', - [ - 'expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent', - 'expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent', - 'expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent', - ]), - ('test_data/publication_tree_bad_1.zip', ['vignettes not found or not a directory']), - ('test_data/publication_tree_bad_2.zip', ['Found a subdirectory in a vignette']), - ('test_data/publication_tree_bad_3.zip', ['A vignette has more than one markdown file']), - ('test_data/publication_tree_bad_4.zip', ['figure dict does not provide a name']), - ('test_data/publication_tree_bad_5.zip', ['figure dict does not reference a file']), - ('test_data/publication_tree_bad_6.zip', ['unexpected files in vignette.*']), - ('test_data/publication_tree_bad_7.zip', ['expected data file' - ' data/codeluppi_2018_nature_methods.molecules.h5ad.zarr' - ' is absent']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/publication_tree_good.zip", []), + ("test_data/publication_tree_good_complex.zip", []), + ( + "test_data/publication_tree_bad_complex.zip", + [ + "expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent", # noqa: E501 + "expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent", # noqa: E501 + "expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent", # noqa: E501 + ], + ), + ("test_data/publication_tree_bad_1.zip", ["vignettes not found or not a directory"]), + ("test_data/publication_tree_bad_2.zip", ["Found a subdirectory in a vignette"]), + ("test_data/publication_tree_bad_3.zip", ["A vignette has more than one markdown file"]), + ("test_data/publication_tree_bad_4.zip", ["figure dict does not provide a name"]), + ("test_data/publication_tree_bad_5.zip", ["figure dict does not reference a file"]), + ("test_data/publication_tree_bad_6.zip", ["unexpected files in vignette.*"]), + ( + "test_data/publication_tree_bad_7.zip", + [ + "expected data file" + " data/codeluppi_2018_nature_methods.molecules.h5ad.zarr" + " is absent" + ], + ), + ), +) def test_publication_validator(test_data_fname, msg_re_list, tmp_path): from publication_validator import PublicationValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = PublicationValidator(tmp_path / test_data_path.stem, 'Publication') + validator = PublicationValidator(tmp_path / test_data_path.stem, "Publication") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = [] diff --git a/tests/test_tiff_validator.py b/tests/test_tiff_validator.py index c2d271b..4ebe66c 100644 --- a/tests/test_tiff_validator.py +++ b/tests/test_tiff_validator.py @@ -1,24 +1,32 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/tiff_tree_good.zip', []), - ('test_data/tiff_tree_bad.zip', [ - '.*notatiff.tif is not a valid TIFF file', - '.*notatiff.tiff is not a valid TIFF file', - '.*notatiff.TIFF is not a valid TIFF file', - '.*notatiff.TIF is not a valid TIFF file', - ]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/tiff_tree_good.zip", []), + ( + "test_data/tiff_tree_bad.zip", + [ + ".*notatiff.tif is not a valid TIFF 
file", + ".*notatiff.tiff is not a valid TIFF file", + ".*notatiff.TIFF is not a valid TIFF file", + ".*notatiff.TIF is not a valid TIFF file", + ], + ), + ), +) def test_tiff_validator(test_data_fname, msg_re_list, tmp_path): from tiff_validator import TiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = TiffValidator(tmp_path / test_data_path.stem, 'codex') + validator = TiffValidator(tmp_path / test_data_path.stem, "codex") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = [] From 174aae05eca907b1f24589040b2a5e300838c704 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 12:40:50 -0500 Subject: [PATCH 19/30] testing black gh action --- .github/workflows/black.yml | 14 +++++++++++++ README.md | 42 +++++++++++++++++++++++++++++++++++-- requirements-dev.txt | 2 +- 3 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..a4a0460 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,14 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: psf/black@stable + with: + options: "--check --verbose --line-length 99" + src: "./src" + version: "~= 24.0" diff --git a/README.md b/README.md index 2db444a..033f5b4 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,47 @@ # ingest-validation-tests -This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. +This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. ## Development process +### Branches + - Make new feature branches from `devel`. - Make PRs to `devel`. (This is the default branch.) -- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling . +- The last reviewer to approve a PR should merge it. + +### Setup + +- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv: + +``` +python3.9 -m venv hm-ingest-validation-tests +source hm-ingest-validation-tests/bin/activate +``` + +- Run `pip install -r requirements-dev.txt` +- (optional) Integrate black with your editor. + - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html) + - If you choose not to integrate black with your editor, run the following from the base `ingest-validation-tests` directory before pushing code to GitHub: `black --line-length 99 .` + +### Testing + +- If ingest-validation-tools is not already set up: + +``` +# Starting from ingest-validation-tests... +cd .. 
+git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
+cd ingest-validation-tests
+pip install -r ../ingest-validation-tools/requirements.txt
+pip install -r ../ingest-validation-tools/requirements-dev.txt
+```
+
+- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run
+
+```
+pip install -r <path-to-ingest-validation-tools>/requirements.txt
+pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
+```
+
+- Run `test.sh`
diff --git a/requirements-dev.txt b/requirements-dev.txt
index d5c087a..2cd9d49 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,10 +1,10 @@
 black==23.12.1
-flake8==7.0.0
 git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils
 imagecodecs>=2023.3.16
 isort==5.13.2
 jsonschema==4.4.0
 pandas>=1.2.0
+pytest==8.0.0
 python-frontmatter>=1.0.0
 tifffile==2020.10.1
 xmlschema>=1.6

From 0dd0cb643a841301b7f1acea6e7c1891bfa70af6 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 12:53:29 -0500
Subject: [PATCH 20/30] linting change

---
 src/ingest_validation_tests/fastq_validator_logic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py
index 8095b74..440ceb4 100644
--- a/src/ingest_validation_tests/fastq_validator_logic.py
+++ b/src/ingest_validation_tests/fastq_validator_logic.py
@@ -122,9 +122,9 @@ def _validate_fastq_line_4(self, line: str) -> List[str]:
 
     def validate_fastq_record(self, line: str, line_number: int) -> List[str]:
         line_index = line_number % 4 + 1
 
-        validator_method: Callable[
-            [FASTQValidatorLogic, str], List[str]
-        ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index]
+        validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = (
+            self._VALIDATE_FASTQ_LINE_METHODS[line_index]
+        )
 
         assert validator_method, f"No validator method defined for record index {line_index}"

From 303e87d8179cceb5b310d0faa64f76726b213d69 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:01:40 -0500
Subject: [PATCH 21/30] testing update to dismiss error about node version

---
 .github/workflows/black.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index a4a0460..d8d7e0b 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -6,7 +6,7 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: psf/black@stable
         with:
           options: "--check --verbose --line-length 99"

From edac78ced52c81068baa4504558d7bdd9e5eeeed Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:06:05 -0500
Subject: [PATCH 22/30] trying isort gh action

---
 .github/workflows/isort.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 .github/workflows/isort.yml

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
new file mode 100644
index 0000000..7bfa84f
--- /dev/null
+++ b/.github/workflows/isort.yml
@@ -0,0 +1,13 @@
+name: Run isort
+on:
+  - push
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: isort/isort-action@v1
+        with:
+          configuration: "--profile black"
+          requirements-files: "requirements.txt requirements-dev.txt"
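Patch 20 above only reflows the type annotation in `validate_fastq_record`, but the dispatch that annotation describes is easy to miss in diff form: FASTQ records are four lines long, and `line_number % 4 + 1` turns a zero-based line counter into a key for `_VALIDATE_FASTQ_LINE_METHODS`. A minimal standalone sketch of that mapping (the rule strings below paraphrase the checks exercised by the tests in this series; they are not copied from the module):

```
# Zero-based line numbers cycle through the four positions of a FASTQ record.
RULES = {
    1: "header: must begin with '@'",
    2: "sequence: no invalid characters",
    3: "separator: must begin with '+'",
    4: "quality: length must match the sequence line",
}

for line_number in range(8):  # two consecutive 4-line records
    line_index = line_number % 4 + 1
    print(f"line {line_number} -> rule {line_index}: {RULES[line_index]}")
```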
From 9d368d12c070eddb93dc35d96786b4a7740edd3f Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:49:30 -0500
Subject: [PATCH 23/30] testing isort 3

---
 .github/workflows/isort.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index 7bfa84f..fdf5e4a 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -7,7 +7,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: isort/isort-action@v1
+      - uses: actions/setup-python@v5
         with:
-          configuration: "--profile black"
-          requirements-files: "requirements.txt requirements-dev.txt"
+          python-version: 3.9
+      - uses: isort/isort-action@master
+        with:
+          configuration: "--profile black"
+          requirementsFiles: "requirements.txt requirements-test.txt"

From 2f6f1d9cbc2070e63565625537acf7f013b97f2c Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:51:03 -0500
Subject: [PATCH 24/30] testing isort 4

---
 .github/workflows/isort.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index fdf5e4a..52f9cf5 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -12,5 +12,5 @@ jobs:
           python-version: 3.9
       - uses: isort/isort-action@master
         with:
-          configuration: "--profile black"
+          configuration: "--check-only --diff --profile black"
           requirementsFiles: "requirements.txt requirements-test.txt"

From 4f9c09fb12e3cd645a5aa8c925c0106bcacdcdf4 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:10:23 -0500
Subject: [PATCH 25/30] updated actions to run on PR; updated README.md

---
 .github/workflows/black.yml | 3 +--
 .github/workflows/isort.yml | 3 +--
 README.md                   | 5 ++++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index d8d7e0b..7c02822 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -1,6 +1,5 @@
-name: Lint
-
-on: [push, pull_request]
+name: Lint
+on: pull_request
 
 jobs:
   lint:
diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index 52f9cf5..db105a8 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -1,6 +1,5 @@
-name: Run isort
-on:
-  - push
+name: Run isort
+on: pull_request
 
 jobs:
   build:
diff --git a/README.md b/README.md
index 033f5b4..4bddaf1 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,10 @@ This repository contains plug-in tests for use during validation of submissions.
 ### Branches
 
 - Make new feature branches from `devel`.
+- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+  - `black --line-length 99 .` (if you choose not to integrate black with your editor; see the Setup section)
+  - `isort .`
+
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
 
@@ -22,7 +26,6 @@ source hm-ingest-validation-tests/bin/activate
 
 - Run `pip install -r requirements-dev.txt`
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
-  - If you choose not to integrate black with your editor, run the following from the base `ingest-validation-tests` directory before pushing code to GitHub: `black --line-length 99 .`
 
 ### Testing
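The README additions in patch 25 tell contributors to run `black --line-length 99 .` by hand. For anyone who wants to see what that check enforces on a single snippet, black also exposes a small Python API; the following is a rough local sketch under the assumption that black is installed, not code from this repository:

```
import black

source = "def f( a,b ):\n    return {'x':a ,'y':b}\n"
mode = black.Mode(line_length=99)  # mirrors --line-length 99 in the workflow

formatted = black.format_str(source, mode=mode)
if formatted != source:
    # Equivalent to a --check failure in the Lint workflow.
    print("would reformat:")
    print(formatted)
```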
From a029b0061d251cba10393e0be7843e2a86fc9a00 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:35:56 -0500
Subject: [PATCH 26/30] cleanup

---
 .flake8    |  3 ---
 .gitignore |  1 +
 README.md  | 48 +++++++++++++++++++++++++++---------------------
 3 files changed, 28 insertions(+), 24 deletions(-)
 delete mode 100644 .flake8

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index ad98539..0000000
--- a/.flake8
+++ /dev/null
@@ -1,3 +0,0 @@
-[flake8]
-max-line-length = 100
-extend-ignore = E203
diff --git a/.gitignore b/.gitignore
index da9fde3..79cb0cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .vscode/
+.flake8
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 4bddaf1..b0f37ab 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,13 @@ This repository contains plug-in tests for use during validation of submissions.
 
 - Make new feature branches from `devel`.
-- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
-  - `black --line-length 99 .` (if you choose not to integrate black with your editor; see the Setup section)
-  - `isort .`
+- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+
+  ```
+  black --line-length 99 .
+  isort --profile black --multi-line 3 .
+  ```
+
+  (You can integrate black and potentially isort with your editor to skip this step; see the Setup section below.)
 
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
@@ -18,33 +23,34 @@ This repository contains plug-in tests for use during validation of submissions.
 
 - Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:
 
-```
-python3.9 -m venv hm-ingest-validation-tests
-source hm-ingest-validation-tests/bin/activate
-```
+  ```
+  python3.9 -m venv hm-ingest-validation-tests
+  source hm-ingest-validation-tests/bin/activate
+  ```
 
 - Run `pip install -r requirements-dev.txt`
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
+- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
 
 ### Testing
 
 - If ingest-validation-tools is not already set up:
 
-```
-# Starting from ingest-validation-tests...
-cd ..
-git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
-cd ingest-validation-tests
-pip install -r ../ingest-validation-tools/requirements.txt
-pip install -r ../ingest-validation-tools/requirements-dev.txt
-```
-
-- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run
-
-```
-pip install -r <path-to-ingest-validation-tools>/requirements.txt
-pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
-```
+  ```
+  # Starting from ingest-validation-tests...
+  cd ..
+  git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
+  cd ingest-validation-tests
+  pip install -r ../ingest-validation-tools/requirements.txt
+  pip install -r ../ingest-validation-tools/requirements-dev.txt
+  ```
+
+- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run:
+
+  ```
+  pip install -r <path-to-ingest-validation-tools>/requirements.txt
+  pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
+  ```
 
 - Run `test.sh`
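Patch 26's README settles on `isort --profile black --multi-line 3 .`, and patch 27 below trims the matching `[tool.isort]` table in `pyproject.toml`. The point of `profile = "black"` (which implies `multi_line_output = 3`, vertical hanging indent) is to keep isort from fighting black over import layout. An illustrative sketch using isort's Python API, assuming isort 5 is installed:

```
import isort

messy = "from typing import TextIO, List, Callable\nimport gzip\nimport re\n"

# profile="black" applies black-compatible settings, including
# multi_line_output=3, matching the pyproject.toml configuration.
print(isort.code(messy, profile="black", line_length=99))
```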
From 9604ce7a38b7a20ebfa37c93c95bffbdca0ed045 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:42:24 -0500
Subject: [PATCH 27/30] cleanup

---
 .github/workflows/isort.yml | 4 ++--
 pyproject.toml              | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index db105a8..2ef4557 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -11,5 +11,5 @@ jobs:
           python-version: 3.9
       - uses: isort/isort-action@master
         with:
-          configuration: "--check-only --diff --profile black"
-          requirementsFiles: "requirements.txt requirements-test.txt"
+          configuration: "--check-only --diff --profile black"
+          requirementsFiles: "requirements.txt requirements-test.txt"
diff --git a/pyproject.toml b/pyproject.toml
index 72dd8d9..93863fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,4 +4,3 @@
 [tool.black]
 line-length = 99
 [tool.isort]
 profile = "black"
 multi_line_output = 3
-src_paths = ["src/ingest-validation-tests"]

From e30f4263b8b9c44c98262fc2f8fce199b209b33c Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:45:11 -0500
Subject: [PATCH 28/30] oops forgot to update test file

---
 tests/test_fastq_validator_logic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py
index c554d6f..31bd68a 100644
--- a/tests/test_fastq_validator_logic.py
+++ b/tests/test_fastq_validator_logic.py
@@ -3,6 +3,7 @@
 from typing import TextIO
 
 import pytest
+
 from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic
 
 _GOOD_RECORDS = """\
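The next patch recreates `.flake8` with `extend-ignore = E203,E501,W503`. E501 is redundant once black enforces the 99-character limit, and E203/W503 are the two pycodestyle rules that black's own output routinely triggers, so ignoring them is the standard black-compatible flake8 setup. A small illustrative sketch of the conflicts (hypothetical code, not from this repository):

```
data, offset = list(range(20)), 2

# black formats a slice with compound bounds with spaces around the colon,
# which pycodestyle flags as E203 (whitespace before ':').
chunk = data[offset + 1 : offset + 5]

# black's style also breaks long expressions *before* binary operators,
# which pycodestyle flags as W503 (line break before binary operator).
total = (
    sum(chunk)
    + offset
    + len(data)
)
```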
From 99858a1dfca86648c83b1f4b9241bd11d59c8b5d Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Fri, 2 Feb 2024 13:46:17 -0500
Subject: [PATCH 29/30] simplifying workflows yml file, adding flake8

---
 .flake8                       |  3 +++
 .github/workflows/black.yml   | 13 -------------
 .github/workflows/isort.yml   | 15 ---------------
 .github/workflows/linting.yml | 32 ++++++++++++++++++++++++++++++++
 .gitignore                    |  1 -
 README.md                     |  6 ++++--
 requirements-dev.txt          |  1 +
 7 files changed, 40 insertions(+), 31 deletions(-)
 create mode 100644 .flake8
 delete mode 100644 .github/workflows/black.yml
 delete mode 100644 .github/workflows/isort.yml
 create mode 100644 .github/workflows/linting.yml

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..70fc30a
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+extend-ignore = E203,E501,W503
+max-line-length = 99
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
deleted file mode 100644
index 7c02822..0000000
--- a/.github/workflows/black.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: Lint
-on: pull_request
-
-jobs:
-  lint:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: psf/black@stable
-        with:
-          options: "--check --verbose --line-length 99"
-          src: "./src"
-          version: "~= 24.0"
diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
deleted file mode 100644
index 2ef4557..0000000
--- a/.github/workflows/isort.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-name: Run isort
-on: pull_request
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-      - uses: isort/isort-action@master
-        with:
-          configuration: "--check-only --diff --profile black"
-          requirementsFiles: "requirements.txt requirements-test.txt"
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
new file mode 100644
index 0000000..f34ebe8
--- /dev/null
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,32 @@
+name: Linters
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 black isort
+      - name: Flake8 Lint
+        run: |
+          flake8 --ignore=E501,W503,E203
+      - name: Black Lint
+        run: |
+          black --line-length 99 --check --verbose
+      - name: isort Lint
+        run: |
+          isort --profile black --check-only --diff
diff --git a/.gitignore b/.gitignore
index 79cb0cf..da9fde3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 .vscode/
-.flake8
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index b0f37ab..0cca7ba 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,15 @@ This repository contains plug-in tests for use during validation of submissions.
 ### Branches
 
 - Make new feature branches from `devel`.
-- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:
 
   ```
   black --line-length 99 .
   isort --profile black --multi-line 3 .
+  flake8
   ```
 
-  (You can integrate black and potentially isort with your editor to skip this step; see the Setup section below.)
+  (Integrating black and potentially isort/flake8 with your editor may allow you to skip this step; see the Setup section below.)
 
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
@@ -32,6 +33,7 @@ This repository contains plug-in tests for use during validation of submissions.
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
 - (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
+- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.
### Testing diff --git a/requirements-dev.txt b/requirements-dev.txt index 2cd9d49..ba6f71b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ black==23.12.1 +flake8==7.0.0 git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils imagecodecs>=2023.3.16 isort==5.13.2 From 67436d4e59399b25fce5d76d159edc85a42b9ca2 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 2 Feb 2024 13:47:38 -0500 Subject: [PATCH 30/30] fixing mistake --- .github/workflows/linting.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index f34ebe8..b4fb9d4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -23,10 +23,10 @@ jobs: pip install flake8 black isort - name: Flake8 Lint run: | - flake8 --ignore=E501,W503,E203 + flake8 --ignore=E501,W503,E203 . - name: Black Lint run: | - black --line-length 99 --check --verbose + black --line-length 99 --check --verbose . - name: isort Lint run: | - isort --profile black --check-only --diff + isort --profile black --check-only --diff .
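With patch 30 applied, the workflow's three lint steps and the README's local commands finally match. As a convenience, the whole gate can be reproduced locally with something like the following sketch (assuming flake8, black, and isort are installed in the active environment; this script is not part of the repository):

```
import subprocess

# The same three checks linting.yml runs once PATCH 30 adds the missing
# "." targets; check=True makes the first lint failure raise an error.
for cmd in (
    ["flake8", "--ignore=E501,W503,E203", "."],
    ["black", "--line-length", "99", "--check", "--verbose", "."],
    ["isort", "--profile", "black", "--check-only", "--diff", "."],
):
    subprocess.run(cmd, check=True)
```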