From 12fa4a4b690868cfeff27279959b7216b75b3e34 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 19 Jan 2024 13:41:35 -0500 Subject: [PATCH 01/30] testing initial refactor --- .../codex_common_errors_validator.py | 297 +++++++++--------- .../codex_json_validator.py | 20 +- .../fastq_validator.py | 7 +- .../fastq_validator_logic.py | 41 ++- src/ingest_validation_tests/gz_validator.py | 53 ++-- .../ome_tiff_validator.py | 64 +++- .../publication_validator.py | 73 ++--- src/ingest_validation_tests/tiff_validator.py | 68 +++- 8 files changed, 362 insertions(+), 261 deletions(-) diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 01bda98..e98c3c6 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -12,6 +12,7 @@ class QuitNowException(Exception): """ Signal exit from this validation test """ + pass @@ -43,166 +44,178 @@ class CodexCommonErrorsValidator(Validator): Test for some common errors in the directory and file structure of CODEX datasets. """ + description = "Test for common problems found in CODEX" cost = 1.0 + def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ + del kwargs if self.assay_type != 'CODEX': return [] # We only test CODEX data - rslt = [] - try: - # is the raw/src_ directory present? - prefix = None - if (self.path / 'raw').is_dir(): - prefix = self.path / 'raw' - else: - for candidate in self.path.glob('src_*'): - prefix = candidate - if prefix is None: - rslt.append('The raw/src_ subdirectory is missing?') - raise QuitNowException() - - # Does dataset.json exist? If so, 'new CODEX' syntax rules - # are in effect - dataset_json_exists = False - any_dataset_json_exists = False - for candidate in self.path.glob('**/dataset.json'): - any_dataset_json_exists = True - if candidate == prefix / 'dataset.json': - dataset_json_exists = True - if dataset_json_exists: - print('FOUND dataset.json; skipping further analysis') - raise QuitNowException() - elif any_dataset_json_exists: - rslt.append('A dataset.json file exists but' - ' is in the wrong place') - - # is the segmentation.json file on the right side? - found = False - right_place = False - for path in self.path.glob('*/[Ss]egmentation.json'): - rel_path = path.relative_to(self.path) - found = True - if str(rel_path).startswith(('raw', 'src_')): - right_place = True - if found: - if right_place: - pass + rslts = [] + for path in self.paths: + rslt = [] + try: + # is the raw/src_ directory present? + prefix = None + if (path / 'raw').is_dir(): + prefix = path / 'raw' else: - rslt.append('The segmentation.json file is in the wrong subdirectory') - else: - rslt.append('The segmentation.json file is missing or misplaced') - - # Does the channelnames.txt file exist? 
- channelnames_txt_path = prefix / 'channelnames.txt' - if not channelnames_txt_path.is_file(): - # sometimes we see this variant - channelnames_txt_path = prefix / 'channelNames.txt' - if not channelnames_txt_path.is_file(): - rslt.append('channelnames.txt is missing') + for candidate in path.glob('src_*'): + prefix = candidate + if prefix is None: + rslt.append('The raw/src_ subdirectory is missing?') raise QuitNowException() - # Parse channelnames.txt into a dataframe - try: - cn_df = pd.read_csv(str(channelnames_txt_path), header=None) - except Exception: - rslt.append(f'Unexpected error reading {channelnames_txt_path}') - raise QuitNowException() - if len(cn_df.columns) != 1: - rslt.append(f'Unexpected format for {channelnames_txt_path}') - raise QuitNowException() - - # Does the channelnames_report.csv file exist? - report_csv_path = prefix / 'channelnames_report.csv' - if report_csv_path.is_file(): - # Parse channelnames_report.txt into a dataframe + # Does dataset.json exist? If so, 'new CODEX' syntax rules + # are in effect + dataset_json_exists = False + any_dataset_json_exists = False + for candidate in path.glob('**/dataset.json'): + any_dataset_json_exists = True + if candidate == prefix / 'dataset.json': + dataset_json_exists = True + if dataset_json_exists: + print('FOUND dataset.json; skipping further analysis') + raise QuitNowException() + elif any_dataset_json_exists: + rslt.append( + 'A dataset.json file exists but' + ' is in the wrong place' + ) + + # is the segmentation.json file on the right side? + found = False + right_place = False + for path in path.glob('*/[Ss]egmentation.json'): + rel_path = path.relative_to(path) + found = True + if str(rel_path).startswith(('raw', 'src_')): + right_place = True + if found: + if right_place: + pass + else: + rslt.append( + 'The segmentation.json file is in the wrong subdirectory' + ) + else: + rslt.append('The segmentation.json file is missing or misplaced') + + # Does the channelnames.txt file exist? + channelnames_txt_path = prefix / 'channelnames.txt' + if not channelnames_txt_path.is_file(): + # sometimes we see this variant + channelnames_txt_path = prefix / 'channelNames.txt' + if not channelnames_txt_path.is_file(): + rslt.append('channelnames.txt is missing') + raise QuitNowException() + + # Parse channelnames.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) + cn_df = pd.read_csv(str(channelnames_txt_path), header=None) except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f'Unexpected error reading {channelnames_txt_path}') raise QuitNowException() - if len(rpt_df) == len(cn_df) + 1: - # channelnames_report.csv appears to have a header + if len(cn_df.columns) != 1: + rslt.append(f'Unexpected format for {channelnames_txt_path}') + raise QuitNowException() + + # Does the channelnames_report.csv file exist? + report_csv_path = prefix / 'channelnames_report.csv' + if report_csv_path.is_file(): + # Parse channelnames_report.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',') + rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) except Exception: rslt.append(f'Unexpected error reading {report_csv_path}') raise QuitNowException() - if len(rpt_df.columns) != 2: - rslt.append(f'Could not parse {report_csv_path}.' - ' Is it a comma-separated table?' - ) - raise QuitNowException() - col_0, col_1 = rpt_df.columns - rpt_df = rpt_df.rename(columns={col_0:'Marker', col_1:'Result'}) - # Do they match? 
- rpt_df['other'] = cn_df[0] - mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] - if len(mismatches_df) != 0: - for idx, row in mismatches_df.iterrows(): + if len(rpt_df) == len(cn_df) + 1: + # channelnames_report.csv appears to have a header + try: + rpt_df = pd.read_csv(str(report_csv_path), sep=',') + except Exception: + rslt.append(f'Unexpected error reading {report_csv_path}') + raise QuitNowException() + if len(rpt_df.columns) != 2: rslt.append( - f"{channelnames_txt_path.name} does not" - " match channelnames_report.txt" - f" on line {idx}: {row['other']} vs {row['Marker']}" + f'Could not parse {report_csv_path}.' + ' Is it a comma-separated table?' ) + raise QuitNowException() + col_0, col_1 = rpt_df.columns + rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'}) + # Do they match? + rpt_df['other'] = cn_df[0] + mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] + if len(mismatches_df) != 0: + for idx, row in mismatches_df.iterrows(): + rslt.append( + f'{channelnames_txt_path.name} does not' + ' match channelnames_report.txt' + f' on line {idx}: {row["other"]} vs {row["Marker"]}' + ) + raise QuitNowException() + else: + rpt_df = None + + # Tabulate the cycle and region info + all_cycle_dirs = [] + for glob_str in ['cyc*', 'Cyc*']: + for pth in prefix.glob(glob_str): + if pth.is_dir(): + all_cycle_dirs.append(str(pth.stem).lower()) + cycles = [] + regions = [] + failures = [] + for cyc_dir in all_cycle_dirs: + try: + cyc_id, reg_id = _split_cycle_dir_string(cyc_dir) + cycles.append(cyc_id) + regions.append(reg_id) + except AssertionError as excp: + failures.append(str(excp)) + if failures: + rslt += failures raise QuitNowException() - else: - rpt_df = None - - # Tabulate the cycle and region info - all_cycle_dirs = [] - for glob_str in ['cyc*', 'Cyc*']: - for pth in prefix.glob(glob_str): - if pth.is_dir(): - all_cycle_dirs.append(str(pth.stem).lower()) - cycles = [] - regions = [] - failures = [] - for cyc_dir in all_cycle_dirs: - try: - cyc_id, reg_id = _split_cycle_dir_string(cyc_dir) - cycles.append(cyc_id) - regions.append(reg_id) - except AssertionError as excp: - failures.append(str(excp)) - if failures: - rslt += failures - raise QuitNowException() - total_entries = len(cycles) - cycles = list(set(cycles)) - cycles.sort() - regions = list(set(regions)) - regions.sort() - failures = [] - # First cycle must be 1 - if cycles[0] != 1: - failures.append('Cycle numbering does not start at 1') - # First region must be 1 - if regions[0] != 1: - failures.append('Region numbering does not start at 1') - # Cycle range must be contiguous ints - if cycles != list(range(cycles[0], cycles[-1]+1)): - failures.append('Cycle numbers are not contiguous') - # Region range must be contiguous ints - if regions != list(range(regions[0], regions[-1]+1)): - failures.append('Region numbers are not contiguous') - # All cycle, region pairs must be present - if len(cycles) * len(regions) != total_entries: - failures.append('Not all cycle/region pairs are present') - # Total number of channels / total number of cycles must be integer, - # excluding any HandE channels - total_channel_count = len(cn_df) - h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) - channels_per_cycle = ((total_channel_count - h_and_e_channel_count) - / len(cycles)) - if channels_per_cycle != int(channels_per_cycle): - failures.append('The number of channels per cycle is not constant') - if failures: - rslt += failures - raise QuitNowException() - - except QuitNowException: - pass 
- return rslt + total_entries = len(cycles) + cycles = list(set(cycles)) + cycles.sort() + regions = list(set(regions)) + regions.sort() + failures = [] + # First cycle must be 1 + if cycles[0] != 1: + failures.append('Cycle numbering does not start at 1') + # First region must be 1 + if regions[0] != 1: + failures.append('Region numbering does not start at 1') + # Cycle range must be contiguous ints + if cycles != list(range(cycles[0], cycles[-1] + 1)): + failures.append('Cycle numbers are not contiguous') + # Region range must be contiguous ints + if regions != list(range(regions[0], regions[-1] + 1)): + failures.append('Region numbers are not contiguous') + # All cycle, region pairs must be present + if len(cycles) * len(regions) != total_entries: + failures.append('Not all cycle/region pairs are present') + # Total number of channels / total number of cycles must be integer, + # excluding any HandE channels + total_channel_count = len(cn_df) + h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) + channels_per_cycle = ( + total_channel_count - h_and_e_channel_count + ) / len(cycles) + if channels_per_cycle != int(channels_per_cycle): + failures.append('The number of channels per cycle is not constant') + if failures: + rslt += failures + raise QuitNowException() + rslts.append(rslt) + + except QuitNowException: + pass + return rslts diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 17b4ee9..5700cfe 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -1,10 +1,9 @@ -from typing import List -from pathlib import Path import json - -from jsonschema import validate +from pathlib import Path +from typing import List from ingest_validation_tools.plugin_validator import Validator +from jsonschema import validate class CodexJsonValidator(Validator): @@ -20,10 +19,11 @@ def collect_errors(self, **kwargs) -> List[str]: rslt = [] for glob_expr in ['**/dataset.json']: - for path in self.path.glob(glob_expr): - instance = json.loads(path.read_text()) - try: - validate(instance=instance, schema=schema) - except Exception as e: - rslt.append(f'{path}: {e}') + for path in self.paths: + for file in path.glob(glob_expr): + instance = json.loads(file.read_text()) + try: + validate(instance=instance, schema=schema) + except Exception as e: + rslt.append(f'{file}: {e}') return rslt diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index 42101e3..dca270b 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from ingest_validation_tools.plugin_validator import Validator from fastq_validator_logic import FASTQValidatorLogic, _log +from ingest_validation_tools.plugin_validator import Validator class FASTQValidator(Validator): @@ -10,8 +10,7 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') + _log(f'Threading at {self.threads}') validator = FASTQValidatorLogic(verbose=True) - validator.validate_fastq_files_in_path(self.path, threads) + validator.validate_fastq_files_in_path(self.paths, self.pool) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py 
b/src/ingest_validation_tests/fastq_validator_logic.py index 4043718..bb247ff 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -1,16 +1,16 @@ import argparse import gzip import re - +from multiprocessing import Manager, pool +from os import cpu_count from pathlib import Path from typing import Callable, List, TextIO -from multiprocessing import Pool, Manager, Lock import fastq_utils def is_valid_filename(filename: str) -> bool: - return fastq_utils.FASTQ_PATTERN.fullmatch(filename) + return bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)) def _open_fastq_file(file: Path) -> TextIO: @@ -26,14 +26,16 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, path: Path, lock): + def __init__(self, validate_object, paths: List[Path], lock): self.validate_object = validate_object - self.path = path + self.paths = paths self.lock = lock def __call__(self, fastq_file): - self.validate_object.validate_fastq_file(self.path / fastq_file, self.lock) - return next(iter(self.validate_object.errors), None) + for path in self.paths: + _log(f"Validating matching fastq files in {path.as_posix()}") + self.validate_object.validate_fastq_file(path / fastq_file, self.lock) + return next(iter(self.validate_object.errors), None) class FASTQValidatorLogic: @@ -126,7 +128,7 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List] = \ + validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ self._VALIDATE_FASTQ_LINE_METHODS[line_index] assert validator_method, \ @@ -202,17 +204,16 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: else: self._file_prefix_counts[filename_prefix] = records_read - def validate_fastq_files_in_path(self, path: Path, threads: int) -> None: + def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> None: data_found_one = [] - _log(f"Validating matching fastq files in {path.as_posix()}") - - dirs_and_files = fastq_utils.collect_fastq_files_by_directory(path) + dirs_and_files = {} + for path in paths: + dirs_and_files.update(fastq_utils.collect_fastq_files_by_directory(path)) with Manager() as manager: lock = manager.Lock() - for directory, file_list in dirs_and_files.items(): + for _, file_list in dirs_and_files.items(): try: - pool = Pool(threads) - engine = Engine(self, path, lock) + engine = Engine(self, paths, lock) data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f'Error {e}') @@ -233,13 +234,9 @@ def main(): args = parser.parse_args() validator = FASTQValidatorLogic(True) - - path: Path - for path in args.filepaths: - if path.is_dir(): - validator.validate_fastq_files_in_path(path, 1) - else: - validator.validate_fastq_file(path, Lock()) + threads = cpu_count() // 4 or 1 + pool = Pool(threads) + validator.validate_fastq_files_in_path(args.filepaths, pool) if __name__ == '__main__': diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index cb25bb1..c3cb66f 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -12,20 +12,22 @@ def _log(message: str): class Engine(object): - def __call__(self, filename): - excluded = r'.*/*fastq.gz' - if re.search(excluded, filename.as_posix()): - return - try: - _log(f'Threaded 
{filename}') - with gzip.open(filename) as g_f: - while True: - buf = g_f.read(1024*1024) - if not buf: - break - except Exception as e: - _log(f'{filename} is not a valid gzipped file {e}') - return f'{filename} is not a valid gzipped file' + def __call__(self, path_list): + for path in path_list: + for filename in path.glob('**/*.gz'): + excluded = r'.*/*fastq.gz' + if re.search(excluded, filename.as_posix()): + return + try: + _log(f'Threaded {filename}') + with gzip.open(filename) as g_f: + while True: + buf = g_f.read(1024*1024) + if not buf: + break + except Exception as e: + _log(f'{filename} is not a valid gzipped file {e}') + return f'{filename} is not a valid gzipped file' class GZValidator(Validator): @@ -34,17 +36,14 @@ class GZValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') - for glob_expr in ['**/*.gz']: - try: - pool = Pool(threads) - engine = Engine() - data_output = pool.imap_unordered(engine, self.path.glob(glob_expr)) - except Exception as e: - _log(f'Error {e}') - else: - pool.close() - pool.join() - [data_output2.append(output) for output in data_output if output] + _log(f'Threading at {self.threads}') + try: + engine = Engine() + data_output = self.pool.imap_unordered(engine, self.paths) + except Exception as e: + _log(f'Error {e}') + else: + pool.close() + pool.join() + [data_output2.append(output) for output in data_output if output] return data_output2 diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index fb05133..063fd7d 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -1,23 +1,65 @@ from typing import List +from pathlib import Path -import xmlschema import tifffile +import xmlschema from ingest_validation_tools.plugin_validator import Validator + +# def _check_ome_tiff_file(paths: str): +# rslt_list = [] +# for path in paths: +# try: +# with tifffile.TiffFile(path) as tf: +# xml_document = xmlschema.XmlDocument(tf.ome_metadata) +# if not xml_document.schema.is_valid(xml_document): +# return f"{path} is not a valid OME.TIFF file" +# except Exception as excp: +# rslt_list.append(f"{path} is not a valid OME.TIFF file: {excp}") +# return rslt_list +# +# def _glob_paths(path: str): +# filenames_to_test = [] +# for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: +# for file in path.glob(glob_expr): +# filenames_to_test.append(file) +# return filenames_to_test +# +# class OmeTiffValidator(Validator): description = "Recursively test all ome-tiff files for validity" cost = 1.0 + def collect_errors(self, **kwargs) -> List[str]: + del kwargs rslt = [] for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: - for path in self.path.glob(glob_expr): - try: - with tifffile.TiffFile(path) as tf: - xml_document = xmlschema.XmlDocument(tf.ome_metadata) - if not xml_document.schema.is_valid(xml_document): - rslt.append(f'{path} is not a valid OME.TIFF file') - except Exception as excp: - rslt.append(f'{path} is not a valid OME.TIFF file: {excp}') + for path in self.paths: + for file in path.glob(glob_expr): + try: + with tifffile.TiffFile(file) as tf: + xml_document = xmlschema.XmlDocument(tf.ome_metadata) + if not xml_document.schema.is_valid(xml_document): + rslt.append(f'{file} is not a valid OME.TIFF file') + except Exception as excp: + 
rslt.append(f'{file} is not a valid OME.TIFF file: {excp}') return rslt - - + # def collect_errors(self, **kwargs) -> List[str]: + # del kwargs + # filenames_to_test = list( + # file + # for file in self.pool.imap_unordered(_glob_paths, self.paths) + # if file is not None + # ) + # # TODO: this is awful + # return list( + # x + # for y in list( + # rslt + # for rslt in self.pool.imap_unordered( + # _check_ome_tiff_file, filenames_to_test + # ) + # if rslt is not None + # ) + # for x in y + # ) diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py index 9399f2c..4c7ccc1 100644 --- a/src/ingest_validation_tests/publication_validator.py +++ b/src/ingest_validation_tests/publication_validator.py @@ -19,46 +19,49 @@ class PublicationValidator(Validator): cost = 1.0 base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)' url_re = r'[Uu][Rr][Ll]' + def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ + del kwargs if self.assay_type != 'Publication': return [] # We only test Publication data rslt = [] - try: - vignette_path = self.path / 'vignettes' - assert vignette_path.is_dir(), 'vignettes not found or not a directory' - for this_vignette_path in vignette_path.glob('*'): - assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" - " in vignettes") - this_vignette_all_paths = set(this_vignette_path.glob('*')) - if not all(pth.is_file() for pth in this_vignette_all_paths): - raise AssertionError('Found a subdirectory in a vignette') - md_found = False - vig_figures = [] - for md_path in this_vignette_path.glob('*.md'): - if md_found: - raise AssertionError('A vignette has more than one markdown file') - else: - md_found = True - vig_fm = frontmatter.loads(md_path.read_text()) - for key in ['name', 'figures']: - assert key in vig_fm.metadata, ('vignette markdown is incorrectly' - f' formatted or has no {key}') - for fig_dict in vig_fm.metadata['figures']: - assert 'file' in fig_dict, 'figure dict does not reference a file' - assert 'name' in fig_dict, 'figure dict does not provide a name' - vig_figures.append(fig_dict['file']) - this_vignette_all_paths.remove(md_path) - for fname in vig_figures: - rslt.extend(self.validate_vitessce_config(this_vignette_path / fname)) - this_vignette_all_paths.remove(this_vignette_path / fname) - assert not this_vignette_all_paths, ('unexpected files in vignette:' - f' {list(str(elt) for elt in this_vignette_all_paths)}') + for path in self.paths: + try: + vignette_path = path / 'vignettes' + assert vignette_path.is_dir(), 'vignettes not found or not a directory' + for this_vignette_path in vignette_path.glob('*'): + assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" + " in vignettes") + this_vignette_all_paths = set(this_vignette_path.glob('*')) + if not all(pth.is_file() for pth in this_vignette_all_paths): + raise AssertionError('Found a subdirectory in a vignette') + md_found = False + vig_figures = [] + for md_path in this_vignette_path.glob('*.md'): + if md_found: + raise AssertionError('A vignette has more than one markdown file') + else: + md_found = True + vig_fm = frontmatter.loads(md_path.read_text()) + for key in ['name', 'figures']: + assert key in vig_fm.metadata, ('vignette markdown is incorrectly' + f' formatted or has no {key}') + for fig_dict in vig_fm.metadata['figures']: + assert 'file' in fig_dict, 'figure dict does not reference a file' + assert 'name' in fig_dict, 'figure dict does not 
provide a name' + vig_figures.append(fig_dict['file']) + this_vignette_all_paths.remove(md_path) + for fname in vig_figures: + rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path)) + this_vignette_all_paths.remove(this_vignette_path / fname) + assert not this_vignette_all_paths, ('unexpected files in vignette:' + f' {list(str(elt) for elt in this_vignette_all_paths)}') - except AssertionError as excp: - rslt.append(str(excp)) + except AssertionError as excp: + rslt.append(str(excp)) return rslt @@ -82,16 +85,16 @@ def url_search_iter(self, root): else: raise AssertionError(f"what is {root} of type {type(root)} ?") - def validate_vitessce_config(self, json_path): + def validate_vitessce_config(self, json_path, path): rslt = [] with open(json_path) as f: dct = json.load(f) - for key, val in self.url_search_iter(dct): + for _, val in self.url_search_iter(dct): try: match = re.match(self.base_url_re, val) if match: # it starts with {{ base_url }} extra_url = match.group(2) - data_path = self.path / 'data' / extra_url + data_path = path / 'data' / extra_url assert data_path.exists(), ("expected data file" f" {Path('data') / extra_url} is absent") diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index fc10da1..2b22ec1 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -1,6 +1,7 @@ -from os import cpu_count from multiprocessing import Pool -from typing import List +from os import cpu_count +from pathlib import Path +from typing import List, Optional import tifffile from ingest_validation_tools.plugin_validator import Validator @@ -8,8 +9,12 @@ # monkey patch tifffile to raise an exception every time a warning # is logged original_log_warning = tifffile.tifffile.log_warning + + def my_log_warning(msg, *args, **kwargs): raise RuntimeError(f"{msg.format(*args, **kwargs)}") + + tifffile.tifffile.log_warning = my_log_warning @@ -28,16 +33,59 @@ def _check_tiff_file(path: str) -> str or None: return f"{path} is not a valid TIFF file: {excp}" +# def _check_tiff_file(paths: List) -> Optional[str]: +# rslt_list = [] +# for path in paths: +# try: +# with tifffile.TiffFile(path) as tfile: +# for page in tfile.pages: +# _ = page.asarray() # force decompression +# return None +# except Exception as excp: +# _log(f"{str(path)} is not a valid TIFF file: {excp}") +# rslt_list.append(f"{path} is not a valid TIFF file: {excp}") +# return rslt_list +# + + +# def _glob_paths(path: str): +# filenames_to_test = [] +# for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: +# for file in path.glob(glob_expr): +# filenames_to_test.append(file) +# return filenames_to_test + + class TiffValidator(Validator): description = "Recursively test all tiff files that are not ome.tiffs for validity" cost = 1.0 + def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - pool = Pool(threads) + del kwargs filenames_to_test = [] - for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: - for path in self.path.glob(glob_expr): - filenames_to_test.append(path) - return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, - filenames_to_test) - if rslt is not None) + for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: + for path in self.paths: + for file in path.glob(glob_expr): + filenames_to_test.append(file) + return list( + rslt + for rslt in self.pool.imap_unordered(_check_tiff_file, 
filenames_to_test) + if rslt is not None + ) + # filenames_to_test = list( + # file + # for file in self.pool.imap_unordered(_glob_paths, self.paths) + # if file is not None + # ) + # # TODO: this is awful + # return list( + # x + # for y in list( + # rslt + # for rslt in self.pool.imap_unordered( + # _check_tiff_file, filenames_to_test + # ) + # if rslt is not None + # ) + # for x in y + # ) From 51c0d7381328903597d5f5a23209b27fc2d65488 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 22 Jan 2024 14:39:35 -0500 Subject: [PATCH 02/30] fixing mistake in gz_validator --- src/ingest_validation_tests/gz_validator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index c3cb66f..c552a7d 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,6 +1,4 @@ -from os import cpu_count import re -from multiprocessing import Pool from typing import List import gzip @@ -43,7 +41,7 @@ def collect_errors(self, **kwargs) -> List[str]: except Exception as e: _log(f'Error {e}') else: - pool.close() - pool.join() + self.pool.close() + self.pool.join() [data_output2.append(output) for output in data_output if output] return data_output2 From f5b16d4381484a367c3f1978481ea6f2c0d9ae3b Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 22 Jan 2024 15:01:20 -0500 Subject: [PATCH 03/30] fixing some mistakes --- .../fastq_validator_logic.py | 2 +- src/ingest_validation_tests/gz_validator.py | 31 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index bb247ff..19f0b4e 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -1,7 +1,7 @@ import argparse import gzip import re -from multiprocessing import Manager, pool +from multiprocessing import Manager, Pool, pool from os import cpu_count from pathlib import Path from typing import Callable, List, TextIO diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index c552a7d..389ba08 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -10,22 +10,21 @@ def _log(message: str): class Engine(object): - def __call__(self, path_list): - for path in path_list: - for filename in path.glob('**/*.gz'): - excluded = r'.*/*fastq.gz' - if re.search(excluded, filename.as_posix()): - return - try: - _log(f'Threaded {filename}') - with gzip.open(filename) as g_f: - while True: - buf = g_f.read(1024*1024) - if not buf: - break - except Exception as e: - _log(f'{filename} is not a valid gzipped file {e}') - return f'{filename} is not a valid gzipped file' + def __call__(self, path): + for filename in path.glob('**/*.gz'): + excluded = r'.*/*fastq.gz' + if re.search(excluded, filename.as_posix()): + return + try: + _log(f'Threaded {filename}') + with gzip.open(filename) as g_f: + while True: + buf = g_f.read(1024*1024) + if not buf: + break + except Exception as e: + _log(f'{filename} is not a valid gzipped file {e}') + return f'{filename} is not a valid gzipped file' class GZValidator(Validator): From 8eff29434b077ff37616bacdbe65959c2e61734b Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 22 Jan 2024 15:49:05 -0500 Subject: [PATCH 04/30] accounting for nested dict structure in fastq 
--- .../fastq_validator_logic.py | 135 ++++++++++-------- 1 file changed, 77 insertions(+), 58 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 19f0b4e..e9569d7 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -1,6 +1,8 @@ import argparse import gzip +import logging import re +from collections import defaultdict from multiprocessing import Manager, Pool, pool from os import cpu_count from pathlib import Path @@ -14,10 +16,7 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return ( - gzip.open(file, 'rt') if file.name.endswith('.gz') - else file.open() - ) + return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() def _log(message: str) -> str: @@ -56,13 +55,13 @@ class FASTQValidatorLogic: # This pattern seeks out the string that includes the lane number (since # that is expected to be present to help anchor the prefix) that comes # before any of _I1, _I2, _R1, or _R2. - _FASTQ_FILE_PREFIX_REGEX = re.compile(r'(.+_L\d+.*)_[IR][12][._]') + _FASTQ_FILE_PREFIX_REGEX = re.compile(r"(.+_L\d+.*)_[IR][12][._]") - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' + _FASTQ_LINE_2_VALID_CHARS = "ACGNT" def __init__(self, verbose=False): self.errors: List[str] = [] - self._filename = '' + self._filename = "" self._line_number = 0 self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() @@ -84,7 +83,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != '@': + if not line or line[0] != "@": return ["Line does not begin with '@'."] return [] @@ -93,46 +92,54 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = ''.join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) + invalid_chars = "".join( + c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS + ) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != '+': + if not line or line[0] != "+": return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append("Line contains invalid quality character(s): " - f'"{invalid_chars}"') + errors.append( + "Line contains invalid quality character(s): " f'"{invalid_chars}"' + ) if len(line) != self._line_2_length: - errors.append(f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters.") + errors.append( + f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters." 
+ ) return errors - _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4} + _VALIDATE_FASTQ_LINE_METHODS = { + 1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4, + } def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ - self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[ + [FASTQValidatorLogic, str], List[str] + ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert validator_method, \ - f"No validator method defined for record index {line_index}" + assert ( + validator_method + ), f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -144,8 +151,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) for error in - self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) + for error in self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -156,9 +163,11 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. - self.errors.append(_log( - "Filename does not have proper format " - "and will not be processed")) + self.errors.append( + _log( + "Filename does not have proper format " "and will not be processed" + ) + ) return self._line_number = 0 @@ -169,14 +178,16 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: records_read = self.validate_fastq_stream(fastq_data) # TODO: Add gzip.BadGzipFile when Python 3.8 is available except IOError: - self.errors.append( - self._format_error("Unable to open FASTQ data file.")) + self.errors.append(self._format_error("Unable to open FASTQ data file.")) return if fastq_file.name in self._file_record_counts.keys(): - self.errors.append(_log( - f"{fastq_file.name} has been found multiple times during this " - "validation.")) + self.errors.append( + _log( + f"{fastq_file.name} has been found multiple times during this " + "validation." + ) + ) self._file_record_counts[fastq_file.name] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) @@ -189,47 +200,55 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: # Find a file we've validated already that matches this # prefix. extant_files = [ - filename for filename, record_count - in self._file_record_counts.items() - if record_count == extant_count and filename.startswith(filename_prefix) + filename + for filename, record_count in self._file_record_counts.items() + if record_count == extant_count + and filename.startswith(filename_prefix) ] # Based on how the dictionaries are created, there should # always be at least one matching filename. assert extant_files - self.errors.append(_log( - f"{fastq_file.name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines).")) + self.errors.append( + _log( + f"{fastq_file.name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines)." 
+ ) + ) else: self._file_prefix_counts[filename_prefix] = records_read def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> None: data_found_one = [] - dirs_and_files = {} + dirs_and_files = defaultdict(dict) for path in paths: - dirs_and_files.update(fastq_utils.collect_fastq_files_by_directory(path)) + dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) + logging.info(f"Added files from {path} to dirs_and_files: {dirs_and_files}") with Manager() as manager: lock = manager.Lock() - for _, file_list in dirs_and_files.items(): - try: - engine = Engine(self, paths, lock) - data_output = pool.imap_unordered(engine, file_list) - except Exception as e: - _log(f'Error {e}') - else: - pool.close() - pool.join() - [data_found_one.append(output) for output in data_output if output] + for path, rel_path in dirs_and_files.items(): + for _, file_list in rel_path.items(): + try: + logging.info(f"Passing paths {paths} to engine.") + engine = Engine(self, paths, lock) + data_output = pool.imap_unordered(engine, file_list) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_found_one.append(output) for output in data_output if output] if len(data_found_one) > 0: self.errors = data_found_one def main(): - parser = argparse.ArgumentParser(description='Validate FASTQ files.') - parser.add_argument('filepaths', type=Path, nargs='+', - help="Files to validate for FASTQ syntax") + parser = argparse.ArgumentParser(description="Validate FASTQ files.") + parser.add_argument( + "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" + ) args = parser.parse_args() @@ -239,5 +258,5 @@ def main(): validator.validate_fastq_files_in_path(args.filepaths, pool) -if __name__ == '__main__': +if __name__ == "__main__": main() From ff856b5a07731ada6e7fb0f8d95400f2e6321141 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 11:59:51 -0500 Subject: [PATCH 05/30] changing threading back to original paradigm --- .../fastq_validator.py | 5 +- .../fastq_validator_logic.py | 117 ++++++++---------- src/ingest_validation_tests/gz_validator.py | 12 +- .../ome_tiff_validator.py | 39 ------ src/ingest_validation_tests/tiff_validator.py | 53 +------- 5 files changed, 68 insertions(+), 158 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index dca270b..d31335f 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -10,7 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - _log(f'Threading at {self.threads}') + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") validator = FASTQValidatorLogic(verbose=True) - validator.validate_fastq_files_in_path(self.paths, self.pool) + validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index e9569d7..315d2ae 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -16,7 +16,10 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() + return ( + gzip.open(file, 'rt') if file.name.endswith('.gz') + else file.open() + ) def 
_log(message: str) -> str: @@ -31,10 +34,12 @@ def __init__(self, validate_object, paths: List[Path], lock): self.lock = lock def __call__(self, fastq_file): + errors = [] for path in self.paths: _log(f"Validating matching fastq files in {path.as_posix()}") self.validate_object.validate_fastq_file(path / fastq_file, self.lock) - return next(iter(self.validate_object.errors), None) + errors.append(next(iter(self.validate_object.errors), None)) + return errors class FASTQValidatorLogic: @@ -55,13 +60,13 @@ class FASTQValidatorLogic: # This pattern seeks out the string that includes the lane number (since # that is expected to be present to help anchor the prefix) that comes # before any of _I1, _I2, _R1, or _R2. - _FASTQ_FILE_PREFIX_REGEX = re.compile(r"(.+_L\d+.*)_[IR][12][._]") + _FASTQ_FILE_PREFIX_REGEX = re.compile(r'(.+_L\d+.*)_[IR][12][._]') - _FASTQ_LINE_2_VALID_CHARS = "ACGNT" + _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' def __init__(self, verbose=False): self.errors: List[str] = [] - self._filename = "" + self._filename = '' self._line_number = 0 self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() @@ -83,7 +88,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != "@": + if not line or line[0] != '@': return ["Line does not begin with '@'."] return [] @@ -92,54 +97,45 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = "".join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS - ) + invalid_chars = ''.join( + c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != "+": + if not line or line[0] != '+': return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append( - "Line contains invalid quality character(s): " f'"{invalid_chars}"' - ) + errors.append("Line contains invalid quality character(s): " + f'"{invalid_chars}"') if len(line) != self._line_2_length: - errors.append( - f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters." 
- ) - + errors.append(f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters.") return errors - _VALIDATE_FASTQ_LINE_METHODS = { - 1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4, - } + _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4} def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[ - [FASTQValidatorLogic, str], List[str] - ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ + self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert ( - validator_method - ), f"No validator method defined for record index {line_index}" + assert validator_method, \ + f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -151,8 +147,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) - for error in self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) for error in + self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -163,11 +159,9 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. - self.errors.append( - _log( - "Filename does not have proper format " "and will not be processed" - ) - ) + self.errors.append(_log( + "Filename does not have proper format " + "and will not be processed")) return self._line_number = 0 @@ -178,16 +172,14 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: records_read = self.validate_fastq_stream(fastq_data) # TODO: Add gzip.BadGzipFile when Python 3.8 is available except IOError: - self.errors.append(self._format_error("Unable to open FASTQ data file.")) + self.errors.append( + self._format_error("Unable to open FASTQ data file.")) return if fastq_file.name in self._file_record_counts.keys(): - self.errors.append( - _log( - f"{fastq_file.name} has been found multiple times during this " - "validation." - ) - ) + self.errors.append(_log( + f"{fastq_file.name} has been found multiple times during this " + "validation.")) self._file_record_counts[fastq_file.name] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) @@ -200,22 +192,18 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: # Find a file we've validated already that matches this # prefix. extant_files = [ - filename - for filename, record_count in self._file_record_counts.items() - if record_count == extant_count - and filename.startswith(filename_prefix) + filename for filename, record_count + in self._file_record_counts.items() + if record_count == extant_count and filename.startswith(filename_prefix) ] # Based on how the dictionaries are created, there should # always be at least one matching filename. assert extant_files - self.errors.append( - _log( - f"{fastq_file.name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines)." 
- ) - ) + self.errors.append(_log( + f"{fastq_file.name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines).")) else: self._file_prefix_counts[filename_prefix] = records_read @@ -224,7 +212,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> No dirs_and_files = defaultdict(dict) for path in paths: dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) - logging.info(f"Added files from {path} to dirs_and_files: {dirs_and_files}") + _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") with Manager() as manager: lock = manager.Lock() for path, rel_path in dirs_and_files.items(): @@ -245,18 +233,15 @@ def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> No def main(): - parser = argparse.ArgumentParser(description="Validate FASTQ files.") - parser.add_argument( - "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" - ) + parser = argparse.ArgumentParser(description='Validate FASTQ files.') + parser.add_argument('filepaths', type=Path, nargs='+', + help="Files to validate for FASTQ syntax") args = parser.parse_args() validator = FASTQValidatorLogic(True) - threads = cpu_count() // 4 or 1 - pool = Pool(threads) - validator.validate_fastq_files_in_path(args.filepaths, pool) + validator.validate_fastq_files_in_path(args.filepaths) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index 389ba08..e6ca6d5 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,3 +1,5 @@ +from multiprocessing import Pool +from os import cpu_count import re from typing import List @@ -33,14 +35,16 @@ class GZValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] - _log(f'Threading at {self.threads}') + threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + _log(f'Threading at {threads}') try: + pool = Pool(threads) engine = Engine() - data_output = self.pool.imap_unordered(engine, self.paths) + data_output = pool.imap_unordered(engine, self.paths) except Exception as e: _log(f'Error {e}') else: - self.pool.close() - self.pool.join() + pool.close() + pool.join() [data_output2.append(output) for output in data_output if output] return data_output2 diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index 063fd7d..2979035 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -6,26 +6,6 @@ from ingest_validation_tools.plugin_validator import Validator -# def _check_ome_tiff_file(paths: str): -# rslt_list = [] -# for path in paths: -# try: -# with tifffile.TiffFile(path) as tf: -# xml_document = xmlschema.XmlDocument(tf.ome_metadata) -# if not xml_document.schema.is_valid(xml_document): -# return f"{path} is not a valid OME.TIFF file" -# except Exception as excp: -# rslt_list.append(f"{path} is not a valid OME.TIFF file: {excp}") -# return rslt_list -# -# def _glob_paths(path: str): -# filenames_to_test = [] -# for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: -# for file in path.glob(glob_expr): -# filenames_to_test.append(file) -# return filenames_to_test -# -# class OmeTiffValidator(Validator): description = "Recursively test all ome-tiff files for validity" cost = 1.0 @@ -44,22 
+24,3 @@ def collect_errors(self, **kwargs) -> List[str]: except Exception as excp: rslt.append(f'{file} is not a valid OME.TIFF file: {excp}') return rslt - # def collect_errors(self, **kwargs) -> List[str]: - # del kwargs - # filenames_to_test = list( - # file - # for file in self.pool.imap_unordered(_glob_paths, self.paths) - # if file is not None - # ) - # # TODO: this is awful - # return list( - # x - # for y in list( - # rslt - # for rslt in self.pool.imap_unordered( - # _check_ome_tiff_file, filenames_to_test - # ) - # if rslt is not None - # ) - # for x in y - # ) diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 2b22ec1..ab033e7 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -33,59 +33,18 @@ def _check_tiff_file(path: str) -> str or None: return f"{path} is not a valid TIFF file: {excp}" -# def _check_tiff_file(paths: List) -> Optional[str]: -# rslt_list = [] -# for path in paths: -# try: -# with tifffile.TiffFile(path) as tfile: -# for page in tfile.pages: -# _ = page.asarray() # force decompression -# return None -# except Exception as excp: -# _log(f"{str(path)} is not a valid TIFF file: {excp}") -# rslt_list.append(f"{path} is not a valid TIFF file: {excp}") -# return rslt_list -# - - -# def _glob_paths(path: str): -# filenames_to_test = [] -# for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: -# for file in path.glob(glob_expr): -# filenames_to_test.append(file) -# return filenames_to_test - - class TiffValidator(Validator): description = "Recursively test all tiff files that are not ome.tiffs for validity" cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - del kwargs + threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + pool = Pool(threads) filenames_to_test = [] - for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: + for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: for path in self.paths: for file in path.glob(glob_expr): filenames_to_test.append(file) - return list( - rslt - for rslt in self.pool.imap_unordered(_check_tiff_file, filenames_to_test) - if rslt is not None - ) - # filenames_to_test = list( - # file - # for file in self.pool.imap_unordered(_glob_paths, self.paths) - # if file is not None - # ) - # # TODO: this is awful - # return list( - # x - # for y in list( - # rslt - # for rslt in self.pool.imap_unordered( - # _check_tiff_file, filenames_to_test - # ) - # if rslt is not None - # ) - # for x in y - # ) + return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, + filenames_to_test) + if rslt is not None) From 1a2626aea6dbb04a5e7ed12177de268de5c27122 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 12:28:46 -0500 Subject: [PATCH 06/30] bugfixing --- src/ingest_validation_tests/codex_json_validator.py | 1 + src/ingest_validation_tests/fastq_validator_logic.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 5700cfe..1e9e9ba 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -11,6 +11,7 @@ class CodexJsonValidator(Validator): cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: + del kwargs if 'codex' not in self.assay_type.lower(): return [] diff --git a/src/ingest_validation_tests/fastq_validator_logic.py 
b/src/ingest_validation_tests/fastq_validator_logic.py index 315d2ae..0065f06 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -3,8 +3,7 @@ import logging import re from collections import defaultdict -from multiprocessing import Manager, Pool, pool -from os import cpu_count +from multiprocessing import Lock, Manager, Pool from pathlib import Path from typing import Callable, List, TextIO @@ -207,7 +206,7 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: else: self._file_prefix_counts[filename_prefix] = records_read - def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> None: + def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: data_found_one = [] dirs_and_files = defaultdict(dict) for path in paths: @@ -219,6 +218,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], pool: pool.Pool) -> No for _, file_list in rel_path.items(): try: logging.info(f"Passing paths {paths} to engine.") + pool = Pool(threads) engine = Engine(self, paths, lock) data_output = pool.imap_unordered(engine, file_list) except Exception as e: @@ -240,7 +240,7 @@ def main(): args = parser.parse_args() validator = FASTQValidatorLogic(True) - validator.validate_fastq_files_in_path(args.filepaths) + validator.validate_fastq_files_in_path(args.filepaths, Lock()) if __name__ == '__main__': From adc24ef65881075cb3475450cde9607f8351be97 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 13:58:29 -0500 Subject: [PATCH 07/30] compare full paths rather than filenames while making sure there are not duplicate files in a dir --- src/ingest_validation_tests/fastq_validator.py | 6 +++--- src/ingest_validation_tests/fastq_validator_logic.py | 8 ++++---- src/ingest_validation_tests/tiff_validator.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index d31335f..0a30e42 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from fastq_validator_logic import FASTQValidatorLogic, _log from ingest_validation_tools.plugin_validator import Validator +from fastq_validator_logic import FASTQValidatorLogic, _log class FASTQValidator(Validator): @@ -10,8 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 - _log(f"Threading at {threads}") + threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + _log(f'Threading at {threads}') validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 0065f06..0b78e21 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -37,7 +37,7 @@ def __call__(self, fastq_file): for path in self.paths: _log(f"Validating matching fastq files in {path.as_posix()}") self.validate_object.validate_fastq_file(path / fastq_file, self.lock) - errors.append(next(iter(self.validate_object.errors), None)) + errors.append(next(iter(self.validate_object.errors))) return errors @@ -175,11 +175,11 @@ def validate_fastq_file(self, 
fastq_file: Path, lock) -> None: self._format_error("Unable to open FASTQ data file.")) return - if fastq_file.name in self._file_record_counts.keys(): + if fastq_file in self._file_record_counts.keys(): self.errors.append(_log( - f"{fastq_file.name} has been found multiple times during this " + f"{fastq_file} has been found ultiple times during this " "validation.")) - self._file_record_counts[fastq_file.name] = records_read + self._file_record_counts[fastq_file] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) with lock: diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index ab033e7..57aba4d 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -1,7 +1,6 @@ from multiprocessing import Pool from os import cpu_count -from pathlib import Path -from typing import List, Optional +from typing import List import tifffile from ingest_validation_tools.plugin_validator import Validator @@ -41,6 +40,7 @@ def collect_errors(self, **kwargs) -> List[str]: threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] + # TODO: this does not exclude OME.TIFF files, should it? for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: for path in self.paths: for file in path.glob(glob_expr): From be508e359710e060d35f137363145c38141ca231 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 24 Jan 2024 15:55:07 -0500 Subject: [PATCH 08/30] making tests pass --- requirements.txt | 10 ++-- .../codex_common_errors_validator.py | 15 ++++- .../fastq_validator_logic.py | 44 +++++++++------ src/ingest_validation_tests/gz_validator.py | 55 ++++++++++--------- tests/test_codex_common_errors_validator.py | 2 +- tests/test_fastq_validator_logic.py | 14 ++--- 6 files changed, 81 insertions(+), 59 deletions(-) diff --git a/requirements.txt b/requirements.txt index e5a685b..9ac14b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -xmlschema>=1.6 -tifffile==2020.10.1 +git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +imagecodecs>=2023.3.16 jsonschema==4.4.0 pandas>=1.2.0 -imagecodecs>=2023.3.16 -python-frontmatter>=1.0.0 -git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +python-frontmatter>=1.1.0 +tifffile==2020.10.1 +xmlschema>=1.6 diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index e98c3c6..8954b05 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -58,6 +58,7 @@ def collect_errors(self, **kwargs) -> List[str]: rslts = [] for path in self.paths: rslt = [] + # TODO: this seems to be replicating dir validation? Is this necessary for legacy support? try: # is the raw/src_ directory present? prefix = None @@ -90,8 +91,8 @@ def collect_errors(self, **kwargs) -> List[str]: # is the segmentation.json file on the right side? 
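# (The hunk below renames the loop variable from `path` to `filepath`;
# reusing `path` shadowed the dataset path that relative_to() compares
# against. A minimal sketch of the intended check, with hypothetical paths:
#     path = Path("dataset")
#     filepath = path / "raw" / "segmentation.json"
#     rel_path = filepath.relative_to(path)       # PosixPath('raw/segmentation.json')
#     str(rel_path).startswith(("raw", "src_"))   # True -> right place
# )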
found = False right_place = False - for path in path.glob('*/[Ss]egmentation.json'): - rel_path = path.relative_to(path) + for filepath in path.glob('*/[Ss]egmentation.json'): + rel_path = filepath.relative_to(path) found = True if str(rel_path).startswith(('raw', 'src_')): right_place = True @@ -113,6 +114,7 @@ def collect_errors(self, **kwargs) -> List[str]: if not channelnames_txt_path.is_file(): rslt.append('channelnames.txt is missing') raise QuitNowException() + # TODO: end questionable portion # Parse channelnames.txt into a dataframe try: @@ -214,8 +216,15 @@ def collect_errors(self, **kwargs) -> List[str]: if failures: rslt += failures raise QuitNowException() - rslts.append(rslt) + if type(rslt) is list: + rslts.extend(rslt) + else: + rslts.append(rslt) except QuitNowException: + if type(rslt) is list: + rslts.extend(rslt) + else: + rslts.append(rslt) pass return rslts diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 0b78e21..9e212b3 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -27,17 +27,17 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, paths: List[Path], lock): + def __init__(self, validate_object, path: Path, lock): self.validate_object = validate_object - self.paths = paths + self.path = path self.lock = lock def __call__(self, fastq_file): errors = [] - for path in self.paths: - _log(f"Validating matching fastq files in {path.as_posix()}") - self.validate_object.validate_fastq_file(path / fastq_file, self.lock) - errors.append(next(iter(self.validate_object.errors))) + _log(f"Validating matching fastq files in {self.path}") + self.validate_object.validate_fastq_file(self.path / fastq_file, self.lock) + for err in self.validate_object.errors: + errors.append(err) return errors @@ -174,12 +174,13 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: self.errors.append( self._format_error("Unable to open FASTQ data file.")) return - - if fastq_file in self._file_record_counts.keys(): + # TODO: this locates duplicate filenames across dirs, is that right? + # Difficult to log instances because first is not captured. 
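# (To illustrate the TODO above with hypothetical paths: because the dict
# below is keyed on bare filenames, dataset/raw/a_L001_R1.fastq.gz and
# dataset/processed/a_L001_R1.fastq.gz both map to "a_L001_R1.fastq.gz",
# so distinct files in different subdirectories get reported as duplicates.)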
+ if fastq_file.name in self._file_record_counts.keys(): self.errors.append(_log( - f"{fastq_file} has been found ultiple times during this " + f"{fastq_file.name} has been found multiple times during this " "validation.")) - self._file_record_counts[fastq_file] = records_read + self._file_record_counts[fastq_file.name] = records_read match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) with lock: @@ -214,19 +215,20 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") with Manager() as manager: lock = manager.Lock() - for path, rel_path in dirs_and_files.items(): - for _, file_list in rel_path.items(): + for path, rel_paths in dirs_and_files.items(): + for rel_path, file_list in rel_paths.items(): + fullpath = Path(path / rel_path) try: - logging.info(f"Passing paths {paths} to engine.") + logging.info(f"Passing paths {fullpath} to engine.") pool = Pool(threads) - engine = Engine(self, paths, lock) + engine = Engine(self, fullpath, lock) data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f"Error {e}") else: pool.close() pool.join() - [data_found_one.append(output) for output in data_output if output] + [data_found_one.extend(output) for output in data_output if output] if len(data_found_one) > 0: self.errors = data_found_one @@ -238,9 +240,19 @@ def main(): help="Files to validate for FASTQ syntax") args = parser.parse_args() + if isinstance(args.filepaths, List): + filepaths = [Path(path) for path in args.filepaths] + elif isinstance(args.filepaths, Path): + filepaths = [args.filepaths] + elif isinstance(args.filepaths, str): + filepaths = [Path(args.filepaths)] + else: + raise Exception( + f"Validator init received base_paths arg as type {type(args.filepaths)}" + ) validator = FASTQValidatorLogic(True) - validator.validate_fastq_files_in_path(args.filepaths, Lock()) + validator.validate_fastq_files_in_path(filepaths, Lock()) if __name__ == '__main__': diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index e6ca6d5..c29653f 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -12,21 +12,20 @@ def _log(message: str): class Engine(object): - def __call__(self, path): - for filename in path.glob('**/*.gz'): - excluded = r'.*/*fastq.gz' - if re.search(excluded, filename.as_posix()): - return - try: - _log(f'Threaded {filename}') - with gzip.open(filename) as g_f: - while True: - buf = g_f.read(1024*1024) - if not buf: - break - except Exception as e: - _log(f'{filename} is not a valid gzipped file {e}') - return f'{filename} is not a valid gzipped file' + def __call__(self, filename): + excluded = r".*/*fastq.gz" + if re.search(excluded, filename.as_posix()): + return + try: + _log(f"Threaded {filename}") + with gzip.open(filename) as g_f: + while True: + buf = g_f.read(1024 * 1024) + if not buf: + break + except Exception as e: + _log(f"{filename} is not a valid gzipped file {e}") + return f"{filename} is not a valid gzipped file" class GZValidator(Validator): @@ -35,16 +34,18 @@ class GZValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') - try: - pool = Pool(threads) - engine = Engine() - data_output = pool.imap_unordered(engine, self.paths) - except Exception as e: - _log(f'Error {e}') - else: - pool.close() - 
pool.join() - [data_output2.append(output) for output in data_output if output] + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") + for path in self.paths: + for glob_expr in ["**/*.gz"]: + try: + pool = Pool(threads) + engine = Engine() + data_output = pool.imap_unordered(engine, path.glob(glob_expr)) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_output2.append(output) for output in data_output if output] return data_output2 diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py index 256584c..23cd5e7 100644 --- a/tests/test_codex_common_errors_validator.py +++ b/tests/test_codex_common_errors_validator.py @@ -40,7 +40,7 @@ def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_pat test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexCommonErrorsValidator(tmp_path / test_data_path.stem, + validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], 'CODEX' ) errors = validator.collect_errors()[:] diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 967c2c4..55dadef 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -39,7 +39,7 @@ def fastq_validator(self) -> FASTQValidatorLogic: return FASTQValidatorLogic() def test_fastq_validator_no_files(self, fastq_validator, tmp_path): - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # This case should return no errors assert fastq_validator.errors == [] @@ -65,7 +65,7 @@ def test_fastq_validator_unrecognized_file(self, fastq_validator, def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path): - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # No files in path means no errors assert fastq_validator.errors == [] @@ -76,7 +76,7 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): with _open_output_file(test_file, use_gzip) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert not fastq_validator.errors def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): @@ -84,7 +84,7 @@ def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): with _open_output_file(test_file, False) as output: output.write('ABCDEF') - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # This test does not assert specific validations but instead that the # overall file failed and that error messages were returned. 
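# For context, _open_output_file is the test module's fixture-writing helper,
# defined above these hunks and outside this diff; a minimal sketch of what
# it presumably does, hedged since the real definition may differ:

import gzip
from pathlib import Path

def _open_output_file(path: Path, use_gzip: bool):
    # Return a text-mode handle for writing FASTQ fixture data,
    # gzip-compressed when use_gzip is True.
    return gzip.open(path, "wt") if use_gzip else path.open("w")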
@@ -98,7 +98,7 @@ def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert "test.fastq has been found multiple times" in \ fastq_validator.errors[0] @@ -192,7 +192,7 @@ def test_fastq_validator_record_counts_good(self, fastq_validator, with _open_output_file(new_file, False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert not fastq_validator.errors @@ -208,7 +208,7 @@ def test_fastq_validator_record_counts_bad(self, fastq_validator, output.write(_GOOD_RECORDS) output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_files_in_path(tmp_path, 2) + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # Order of the files being processed is not guaranteed, however these # strings ensure that a mismatch was found. From 7d9d55ad5ee6f5a4f41149d0f464f90007b5e3d8 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 24 Jan 2024 16:36:37 -0500 Subject: [PATCH 09/30] small linting change --- src/ingest_validation_tests/ome_tiff_validator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index 2979035..e4df25a 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -1,5 +1,4 @@ from typing import List -from pathlib import Path import tifffile import xmlschema From f923db4ac9d89672819dbc3ce47e9886100f5cc7 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:09:11 -0500 Subject: [PATCH 10/30] testing different way of passing files for concurrent processing --- .../fastq_validator_logic.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 9e212b3..37791cf 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -27,15 +27,14 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, path: Path, lock): + def __init__(self, validate_object, lock): self.validate_object = validate_object - self.path = path self.lock = lock def __call__(self, fastq_file): errors = [] - _log(f"Validating matching fastq files in {self.path}") - self.validate_object.validate_fastq_file(self.path / fastq_file, self.lock) + _log(f"Validating matching fastq file {fastq_file}") + self.validate_object.validate_fastq_file(fastq_file, self.lock) for err in self.validate_object.errors: errors.append(err) return errors @@ -176,6 +175,7 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: return # TODO: this locates duplicate filenames across dirs, is that right? # Difficult to log instances because first is not captured. 
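# (Why more concurrency makes this check unreliable, sketched against the
# shared Manager().dict() used here: two pool workers can interleave as
#     worker A: "x.fastq" in counts  -> False
#     worker B: "x.fastq" in counts  -> False
#     worker A: counts["x.fastq"] = n_a
#     worker B: counts["x.fastq"] = n_b
# so neither reports the duplicate; the membership test and the assignment
# are not atomic without a lock.)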
+ # ALSO this doesn't work reliably when more threads are running concurrently if fastq_file.name in self._file_record_counts.keys(): self.errors.append(_log( f"{fastq_file.name} has been found multiple times during this " @@ -213,22 +213,24 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: for path in paths: dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") + file_list = [] with Manager() as manager: lock = manager.Lock() for path, rel_paths in dirs_and_files.items(): - for rel_path, file_list in rel_paths.items(): - fullpath = Path(path / rel_path) - try: - logging.info(f"Passing paths {fullpath} to engine.") - pool = Pool(threads) - engine = Engine(self, fullpath, lock) - data_output = pool.imap_unordered(engine, file_list) - except Exception as e: - _log(f"Error {e}") - else: - pool.close() - pool.join() - [data_found_one.extend(output) for output in data_output if output] + for rel_path, files in rel_paths.items(): + for file in files: + file_list.append(Path(path / rel_path / file)) + try: + logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.") + pool = Pool(threads) + engine = Engine(self, lock) + data_output = pool.imap_unordered(engine, file_list) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_found_one.extend(output) for output in data_output if output] if len(data_found_one) > 0: self.errors = data_found_one From 5e7bd34fe4067c5f3aadf0dc4ce43934f30968bd Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:19:51 -0500 Subject: [PATCH 11/30] replicating changes to ome_tiff/tiff checking --- .../ome_tiff_validator.py | 38 +++++++++++++------ src/ingest_validation_tests/tiff_validator.py | 4 +- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/ingest_validation_tests/ome_tiff_validator.py b/src/ingest_validation_tests/ome_tiff_validator.py index e4df25a..e7e758d 100644 --- a/src/ingest_validation_tests/ome_tiff_validator.py +++ b/src/ingest_validation_tests/ome_tiff_validator.py @@ -1,3 +1,5 @@ +from multiprocessing import Pool +from os import cpu_count from typing import List import tifffile @@ -5,21 +7,35 @@ from ingest_validation_tools.plugin_validator import Validator +def _check_ome_tiff_file(file: str): + try: + with tifffile.TiffFile(file) as tf: + xml_document = xmlschema.XmlDocument(tf.ome_metadata) + if xml_document.schema and not xml_document.schema.is_valid(xml_document): + return f"{file} is not a valid OME.TIFF file" + except Exception as excp: + return f"{file} is not a valid OME.TIFF file: {excp}" + + class OmeTiffValidator(Validator): description = "Recursively test all ome-tiff files for validity" cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - del kwargs - rslt = [] - for glob_expr in ['**/*.ome.tif', '**/*.ome.tiff', '**/*.OME.TIFF', '**/*.OME.TIF']: + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + pool = Pool(threads) + filenames_to_test = [] + for glob_expr in [ + "**/*.ome.tif", + "**/*.ome.tiff", + "**/*.OME.TIFF", + "**/*.OME.TIF", + ]: for path in self.paths: for file in path.glob(glob_expr): - try: - with tifffile.TiffFile(file) as tf: - xml_document = xmlschema.XmlDocument(tf.ome_metadata) - if not xml_document.schema.is_valid(xml_document): - rslt.append(f'{file} is not a valid OME.TIFF file') - except Exception as excp: - rslt.append(f'{file} is not a valid OME.TIFF file: {excp}') - 
return rslt + filenames_to_test.append(file) + return list( + rslt + for rslt in pool.imap_unordered(_check_ome_tiff_file, filenames_to_test) + if rslt is not None + ) diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 57aba4d..8615646 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -1,6 +1,6 @@ from multiprocessing import Pool from os import cpu_count -from typing import List +from typing import List, Optional import tifffile from ingest_validation_tools.plugin_validator import Validator @@ -21,7 +21,7 @@ def _log(message: str): print(message) -def _check_tiff_file(path: str) -> str or None: +def _check_tiff_file(path: str) -> Optional[str]: try: with tifffile.TiffFile(path) as tfile: for page in tfile.pages: From dad0b7d679ea797b7014e32f3234405f3a8413a4 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:26:49 -0500 Subject: [PATCH 12/30] updated gz_validator --- src/ingest_validation_tests/gz_validator.py | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index c29653f..58ad671 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -36,16 +36,18 @@ def collect_errors(self, **kwargs) -> List[str]: data_output2 = [] threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 _log(f"Threading at {threads}") + file_list = [] for path in self.paths: for glob_expr in ["**/*.gz"]: - try: - pool = Pool(threads) - engine = Engine() - data_output = pool.imap_unordered(engine, path.glob(glob_expr)) - except Exception as e: - _log(f"Error {e}") - else: - pool.close() - pool.join() - [data_output2.append(output) for output in data_output if output] + file_list.extend(path.glob(glob_expr)) + try: + pool = Pool(threads) + engine = Engine() + data_output = pool.imap_unordered(engine, file_list) + except Exception as e: + _log(f"Error {e}") + else: + pool.close() + pool.join() + [data_output2.append(output) for output in data_output if output] return data_output2 From 5b2f835be9bcd42163702f315ea156f05fc81482 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 12:41:27 -0500 Subject: [PATCH 13/30] cleaning up TODOs --- .../codex_common_errors_validator.py | 2 -- src/ingest_validation_tests/fastq_validator_logic.py | 7 +++++-- src/ingest_validation_tests/tiff_validator.py | 1 - tests/test_fastq_validator_logic.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 8954b05..391e9cc 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -58,7 +58,6 @@ def collect_errors(self, **kwargs) -> List[str]: rslts = [] for path in self.paths: rslt = [] - # TODO: this seems to be replicating dir validation? Is this necessary for legacy support? try: # is the raw/src_ directory present? 
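# (Accepted layouts, with hypothetical names: raw data lives either under
#     dataset/raw/...        e.g. dataset/raw/channelnames.txt
# or under a src_* sibling
#     dataset/src_001/...    e.g. dataset/src_001/channelnames.txt
# and the code below resolves `prefix` to whichever one exists.)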
prefix = None @@ -114,7 +113,6 @@ def collect_errors(self, **kwargs) -> List[str]: if not channelnames_txt_path.is_file(): rslt.append('channelnames.txt is missing') raise QuitNowException() - # TODO: end questionable portion # Parse channelnames.txt into a dataframe try: diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 37791cf..ad52ee8 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -168,10 +168,13 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: try: with _open_fastq_file(fastq_file) as fastq_data: records_read = self.validate_fastq_stream(fastq_data) - # TODO: Add gzip.BadGzipFile when Python 3.8 is available + except gzip.BadGzipFile: + self.errors.append( + self._format_error(f"Bad gzip file: {fastq_file}.")) + return except IOError: self.errors.append( - self._format_error("Unable to open FASTQ data file.")) + self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return # TODO: this locates duplicate filenames across dirs, is that right? # Difficult to log instances because first is not captured. diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 8615646..db840c1 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -40,7 +40,6 @@ def collect_errors(self, **kwargs) -> List[str]: threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] - # TODO: this does not exclude OME.TIFF files, should it? for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: for path in self.paths: for file in path.glob(glob_expr): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 55dadef..797abc5 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -51,7 +51,7 @@ def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file, Lock()) - assert "Unable to open" in fastq_validator.errors[0] + assert "Bad gzip file" in fastq_validator.errors[0] def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path): From 1c39a39e187c689f7d09b8a82002d3cdaa85d611 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 25 Jan 2024 16:55:54 -0500 Subject: [PATCH 14/30] fixing fastq per-dataset validation --- .../fastq_validator_logic.py | 100 ++++++++++-------- tests/test_fastq_validator_logic.py | 8 +- 2 files changed, 58 insertions(+), 50 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index ad52ee8..6784a8b 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -27,14 +27,13 @@ def _log(message: str) -> str: class Engine(object): - def __init__(self, validate_object, lock): + def __init__(self, validate_object): self.validate_object = validate_object - self.lock = lock def __call__(self, fastq_file): errors = [] _log(f"Validating matching fastq file {fastq_file}") - self.validate_object.validate_fastq_file(fastq_file, self.lock) + self.validate_object.validate_fastq_file(fastq_file) for err in self.validate_object.errors: errors.append(err) return errors @@ -55,19 +54,14 @@ class FASTQValidatorLogic: contain the same 
number of symbols as letters in the sequence. """ - # This pattern seeks out the string that includes the lane number (since - # that is expected to be present to help anchor the prefix) that comes - # before any of _I1, _I2, _R1, or _R2. - _FASTQ_FILE_PREFIX_REGEX = re.compile(r'(.+_L\d+.*)_[IR][12][._]') - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' def __init__(self, verbose=False): self.errors: List[str] = [] - self._filename = '' - self._line_number = 0 self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() + self._filename = '' + self._line_number = 0 self._verbose = verbose @@ -151,7 +145,7 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: return line_count + 1 - def validate_fastq_file(self, fastq_file: Path, lock) -> None: + def validate_fastq_file(self, fastq_file: Path) -> None: _log(f"Validating {fastq_file.name}...") _log(f" → {fastq_file.absolute().as_posix()}") @@ -176,39 +170,7 @@ def validate_fastq_file(self, fastq_file: Path, lock) -> None: self.errors.append( self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return - # TODO: this locates duplicate filenames across dirs, is that right? - # Difficult to log instances because first is not captured. - # ALSO this doesn't work reliably when more threads are running concurrently - if fastq_file.name in self._file_record_counts.keys(): - self.errors.append(_log( - f"{fastq_file.name} has been found multiple times during this " - "validation.")) - self._file_record_counts[fastq_file.name] = records_read - - match = self._FASTQ_FILE_PREFIX_REGEX.match(fastq_file.name) - with lock: - if match: - filename_prefix = match.group(1) - if filename_prefix in self._file_prefix_counts.keys(): - extant_count = self._file_prefix_counts[filename_prefix] - if extant_count != records_read: - # Find a file we've validated already that matches this - # prefix. - extant_files = [ - filename for filename, record_count - in self._file_record_counts.items() - if record_count == extant_count and filename.startswith(filename_prefix) - ] - # Based on how the dictionaries are created, there should - # always be at least one matching filename. - assert extant_files - - self.errors.append(_log( - f"{fastq_file.name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines).")) - else: - self._file_prefix_counts[filename_prefix] = records_read + self._file_record_counts[str(fastq_file)] = records_read def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: data_found_one = [] @@ -226,17 +188,63 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: try: logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.") pool = Pool(threads) - engine = Engine(self, lock) + engine = Engine(self) data_output = pool.imap_unordered(engine, file_list) except Exception as e: _log(f"Error {e}") else: pool.close() pool.join() + self._find_duplicates(dirs_and_files) + self._find_shared_prefixes(lock) [data_found_one.extend(output) for output in data_output if output] if len(data_found_one) > 0: - self.errors = data_found_one + self.errors.extend(data_found_one) + + def _find_duplicates(self, dirs_and_files): + for data_path, sub_dirs in dirs_and_files.items(): + # Creates a dict of filenames to list of full filepaths for each + # fastq file in a given data_path (dataset dir). 
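# e.g., with hypothetical paths, the mapping built below looks like
#     files_per_path == {
#         "a_L001_R1.fastq": [Path("data/raw"), Path("data/processed")],
#         "b_L001_R1.fastq": [Path("data/raw")],
#     }
# and only "a_L001_R1.fastq", found in two locations, is flagged.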
+ files_per_path = defaultdict(list) + for sub_path, filepaths in sub_dirs.items(): + for filepath in filepaths: + files_per_path[filepath.name].append(data_path / sub_path) + for filename, filepaths in files_per_path.items(): + if len(filepaths) > 1: + self.errors.append(_log( + f"{filename} has been found multiple times during this validation of path {data_path}. Duplicates: {filepaths}.")) # noqa: E501 + + def _find_shared_prefixes(self, lock): + # This pattern seeks out the string that includes the lane number (since + # that is expected to be present to help anchor the prefix) that comes + # before any of _I1, _I2, _R1, or _R2. + fastq_file_prefix_regex = re.compile(r'(.+_L\d+.*)_[IR][12][._]') + for fastq_file, records_read in self._file_record_counts.items(): + match = fastq_file_prefix_regex.match(Path(fastq_file).name) + with lock: + if match: + filename_prefix = match.group(1) + if filename_prefix in self._file_prefix_counts.keys(): + extant_count = self._file_prefix_counts[filename_prefix] + if extant_count != records_read: + # Find a file we've validated already that matches this + # prefix. + extant_files = [ + str(Path(filepath).name) for filepath, record_count + in self._file_record_counts.items() + if record_count == extant_count and Path(filepath).name.startswith(filename_prefix) + ] + # Based on how the dictionaries are created, there should + # always be at least one matching filename. + assert extant_files + + self.errors.append(_log( + f"{Path(fastq_file).name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines).")) + else: + self._file_prefix_counts[filename_prefix] = records_read def main(): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 797abc5..a244cd8 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -50,7 +50,7 @@ def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_file(test_file, Lock()) + fastq_validator.validate_fastq_file(test_file) assert "Bad gzip file" in fastq_validator.errors[0] def test_fastq_validator_unrecognized_file(self, fastq_validator, @@ -59,7 +59,7 @@ def test_fastq_validator_unrecognized_file(self, fastq_validator, with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) - fastq_validator.validate_fastq_file(test_file, Lock()) + fastq_validator.validate_fastq_file(test_file) assert "Filename does not have proper format" in \ fastq_validator.errors[0] @@ -105,7 +105,7 @@ def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): def test_fastq_validator_io_error(self, fastq_validator, tmp_path): fake_path = tmp_path.joinpath('does-not-exist.fastq') - fastq_validator.validate_fastq_file(fake_path, Lock()) + fastq_validator.validate_fastq_file(fake_path) assert "Unable to open" in fastq_validator.errors[0] @@ -178,7 +178,7 @@ def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, with _open_output_file(new_file, False) as output: output.write(test_data) - fastq_validator.validate_fastq_file(new_file, Lock()) + fastq_validator.validate_fastq_file(new_file) assert "contains 9 characters which does not match line 2's 10" in \ fastq_validator.errors[0] From fb7d110a4e6069d8b8c629a25c6fde43af7f56a3 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 29 Jan 2024 16:31:38 -0500 Subject: [PATCH 15/30] slightly clearer error 
messaging --- src/ingest_validation_tests/fastq_validator_logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 6784a8b..16403e4 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -213,7 +213,7 @@ def _find_duplicates(self, dirs_and_files): for filename, filepaths in files_per_path.items(): if len(filepaths) > 1: self.errors.append(_log( - f"{filename} has been found multiple times during this validation of path {data_path}. Duplicates: {filepaths}.")) # noqa: E501 + f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}.")) # noqa: E501 def _find_shared_prefixes(self, lock): # This pattern seeks out the string that includes the lane number (since From ae28b42a8b8378ff87fc37b0b020be03c06aa295 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 30 Jan 2024 12:55:20 -0500 Subject: [PATCH 16/30] updating .travis.yml with newest IVT release --- .travis.yml | 8 ++++---- src/ingest_validation_tests/fastq_validator_logic.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index adf9a70..beeedc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,20 @@ language: python python: - - '3.9' + - "3.9" cache: pip install: - pip install -r requirements.txt # - pip install -r requirements-dev.txt - + - pwd - cd .. - git clone https://github.com/hubmapconsortium/ingest-validation-tools.git - cd ingest-validation-tools - pwd - - git checkout v0.0.15 + - git checkout v0.0.17 # - pip install "setuptools==60.9.0" - pip install -r requirements.txt - + - cd ../ingest-validation-tests # Not sure if this is required, or if it resets between sections. script: - ./test.sh diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 16403e4..85eeb44 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -180,6 +180,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: _log(f"Added files from {path} to dirs_and_files: {dirs_and_files}") file_list = [] with Manager() as manager: + # TODO: re-evaluate dicts/for loops lock = manager.Lock() for path, rel_paths in dirs_and_files.items(): for rel_path, files in rel_paths.items(): @@ -203,6 +204,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: self.errors.extend(data_found_one) def _find_duplicates(self, dirs_and_files): + # TODO: re-evaluate dicts/for loops for data_path, sub_dirs in dirs_and_files.items(): # Creates a dict of filenames to list of full filepaths for each # fastq file in a given data_path (dataset dir). 
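Both "re-evaluate dicts/for loops" TODOs above concern the nested mapping
built by fastq_utils.collect_fastq_files_by_directory. A hedged sketch of
the shape being iterated, with hypothetical paths (the exact return type
belongs to the fastq-utils library and may differ):

    dirs_and_files = {
        Path("dataset1"): {                 # one entry per dataset path
            Path("raw"): [Path("a_L001_R1.fastq.gz"), Path("a_L001_R2.fastq.gz")],
            Path("processed"): [Path("a_L001_R1.fastq.gz")],
        },
    }
    # flattened into file_list as dataset1/raw/a_L001_R1.fastq.gz, etc.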
From f69bf7db12ead2fba59ce200b85c92dbfba4530d Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Tue, 23 Jan 2024 14:22:10 -0500 Subject: [PATCH 17/30] adding linting and formatting tools --- .flake8 | 3 +++ pyproject.toml | 7 +++++++ requirements-dev.txt | 10 ++++++++++ 3 files changed, 20 insertions(+) create mode 100644 .flake8 create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..ad98539 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 100 +extend-ignore = E203 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..72dd8d9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[tool.black] +line-length = 99 + +[tool.isort] +profile = "black" +multi_line_output = 3 +src_paths = ["src/ingest-validation-tests"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..d5c087a --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,10 @@ +black==23.12.1 +flake8==7.0.0 +git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +imagecodecs>=2023.3.16 +isort==5.13.2 +jsonschema==4.4.0 +pandas>=1.2.0 +python-frontmatter>=1.0.0 +tifffile==2020.10.1 +xmlschema>=1.6 From 930fd015a671cdadd51520c8024d39ab8b4fe71d Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 11:35:10 -0500 Subject: [PATCH 18/30] ran black / flake8 / isort, fixed errors, updated .gitignore --- .gitignore | 1 + .../codex_common_errors_validator.py | 104 +++++++--------- .../codex_json_validator.py | 8 +- .../fastq_validator.py | 6 +- .../fastq_validator_logic.py | 107 ++++++++-------- src/ingest_validation_tests/gz_validator.py | 4 +- .../publication_validator.py | 63 ++++++---- src/ingest_validation_tests/tiff_validator.py | 12 +- tests/pytest_runner.py | 18 +-- tests/test_codex_common_errors_validator.py | 107 ++++++++++------ tests/test_codex_json_validator.py | 25 ++-- tests/test_fastq_validator_logic.py | 116 ++++++++---------- tests/test_gz_validator.py | 19 +-- tests/test_ome_tiff_validator.py | 23 ++-- tests/test_publication_validator.py | 56 +++++---- tests/test_tiff_validator.py | 32 +++-- 16 files changed, 381 insertions(+), 320 deletions(-) diff --git a/.gitignore b/.gitignore index 916db7b..da9fde3 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,7 @@ celerybeat.pid # Environments .env +.envrc .venv env/ venv/ diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 391e9cc..5edbec8 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -20,22 +20,20 @@ def _split_cycle_dir_string(cycle_str): """ Given a cycle-and-region directory name, split out the cyle and region numbers """ - words = cycle_str.split('_') + words = cycle_str.split("_") assert len(words) >= 2, f'Directory string "{cycle_str}" has unexpected form' - assert words[0].startswith('cyc'), (f'directory string "{cycle_str}" does' - ' not start with "cyc"') + assert words[0].startswith("cyc"), ( + f'directory string "{cycle_str}" does' ' not start with "cyc"' + ) try: - cyc_id = int(words[0][len('cyc'):]) + cyc_id = int(words[0][len("cyc") :]) except ValueError: - raise AssertionError(f'Directory string "{cycle_str}" cycle number is' - ' not an integer') - assert words[1].startswith('reg'), (f'Directory string "{cycle_str}" does' - ' not include "_reg"') + raise 
AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer") + assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"' try: - reg_id = int(words[1][len('reg'):]) + reg_id = int(words[1][len("reg") :]) except ValueError: - raise AssertionError(f'Directory string "{cycle_str}" region number is' - ' not an integer') + raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer") return cyc_id, reg_id @@ -53,7 +51,7 @@ def collect_errors(self, **kwargs) -> List[str]: Return the errors found by this validator """ del kwargs - if self.assay_type != 'CODEX': + if self.assay_type != "CODEX": return [] # We only test CODEX data rslts = [] for path in self.paths: @@ -61,101 +59,95 @@ def collect_errors(self, **kwargs) -> List[str]: try: # is the raw/src_ directory present? prefix = None - if (path / 'raw').is_dir(): - prefix = path / 'raw' + if (path / "raw").is_dir(): + prefix = path / "raw" else: - for candidate in path.glob('src_*'): + for candidate in path.glob("src_*"): prefix = candidate if prefix is None: - rslt.append('The raw/src_ subdirectory is missing?') + rslt.append("The raw/src_ subdirectory is missing?") raise QuitNowException() # Does dataset.json exist? If so, 'new CODEX' syntax rules # are in effect dataset_json_exists = False any_dataset_json_exists = False - for candidate in path.glob('**/dataset.json'): + for candidate in path.glob("**/dataset.json"): any_dataset_json_exists = True - if candidate == prefix / 'dataset.json': + if candidate == prefix / "dataset.json": dataset_json_exists = True if dataset_json_exists: - print('FOUND dataset.json; skipping further analysis') + print("FOUND dataset.json; skipping further analysis") raise QuitNowException() elif any_dataset_json_exists: - rslt.append( - 'A dataset.json file exists but' - ' is in the wrong place' - ) + rslt.append("A dataset.json file exists but" " is in the wrong place") # is the segmentation.json file on the right side? found = False right_place = False - for filepath in path.glob('*/[Ss]egmentation.json'): + for filepath in path.glob("*/[Ss]egmentation.json"): rel_path = filepath.relative_to(path) found = True - if str(rel_path).startswith(('raw', 'src_')): + if str(rel_path).startswith(("raw", "src_")): right_place = True if found: if right_place: pass else: - rslt.append( - 'The segmentation.json file is in the wrong subdirectory' - ) + rslt.append("The segmentation.json file is in the wrong subdirectory") else: - rslt.append('The segmentation.json file is missing or misplaced') + rslt.append("The segmentation.json file is missing or misplaced") # Does the channelnames.txt file exist? 
- channelnames_txt_path = prefix / 'channelnames.txt' + channelnames_txt_path = prefix / "channelnames.txt" if not channelnames_txt_path.is_file(): # sometimes we see this variant - channelnames_txt_path = prefix / 'channelNames.txt' + channelnames_txt_path = prefix / "channelNames.txt" if not channelnames_txt_path.is_file(): - rslt.append('channelnames.txt is missing') + rslt.append("channelnames.txt is missing") raise QuitNowException() # Parse channelnames.txt into a dataframe try: cn_df = pd.read_csv(str(channelnames_txt_path), header=None) except Exception: - rslt.append(f'Unexpected error reading {channelnames_txt_path}') + rslt.append(f"Unexpected error reading {channelnames_txt_path}") raise QuitNowException() if len(cn_df.columns) != 1: - rslt.append(f'Unexpected format for {channelnames_txt_path}') + rslt.append(f"Unexpected format for {channelnames_txt_path}") raise QuitNowException() # Does the channelnames_report.csv file exist? - report_csv_path = prefix / 'channelnames_report.csv' + report_csv_path = prefix / "channelnames_report.csv" if report_csv_path.is_file(): # Parse channelnames_report.txt into a dataframe try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',', header=None) + rpt_df = pd.read_csv(str(report_csv_path), sep=",", header=None) except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f"Unexpected error reading {report_csv_path}") raise QuitNowException() if len(rpt_df) == len(cn_df) + 1: # channelnames_report.csv appears to have a header try: - rpt_df = pd.read_csv(str(report_csv_path), sep=',') + rpt_df = pd.read_csv(str(report_csv_path), sep=",") except Exception: - rslt.append(f'Unexpected error reading {report_csv_path}') + rslt.append(f"Unexpected error reading {report_csv_path}") raise QuitNowException() if len(rpt_df.columns) != 2: rslt.append( - f'Could not parse {report_csv_path}.' - ' Is it a comma-separated table?' + f"Could not parse {report_csv_path}." " Is it a comma-separated table?" ) raise QuitNowException() col_0, col_1 = rpt_df.columns - rpt_df = rpt_df.rename(columns={col_0: 'Marker', col_1: 'Result'}) + rpt_df = rpt_df.rename(columns={col_0: "Marker", col_1: "Result"}) # Do they match? 
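# (A toy illustration of the row-wise comparison below, data hypothetical:
#     cn_df  = pd.DataFrame({0: ["DAPI", "CD4"]})
#     rpt_df = pd.DataFrame({"Marker": ["DAPI", "CD8"], "Result": ["ok", "ok"]})
#     rpt_df["other"] = cn_df[0]                   # aligns on the row index
#     rpt_df[rpt_df["other"] != rpt_df["Marker"]]  # -> row 1: CD8 vs CD4
# which is exactly the line-by-line mismatch reported below.)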
- rpt_df['other'] = cn_df[0] - mismatches_df = rpt_df[rpt_df['other'] != rpt_df['Marker']] + rpt_df["other"] = cn_df[0] + mismatches_df = rpt_df[rpt_df["other"] != rpt_df["Marker"]] if len(mismatches_df) != 0: for idx, row in mismatches_df.iterrows(): rslt.append( - f'{channelnames_txt_path.name} does not' - ' match channelnames_report.txt' + f"{channelnames_txt_path.name} does not" + " match channelnames_report.txt" f' on line {idx}: {row["other"]} vs {row["Marker"]}' ) raise QuitNowException() @@ -164,7 +156,7 @@ def collect_errors(self, **kwargs) -> List[str]: # Tabulate the cycle and region info all_cycle_dirs = [] - for glob_str in ['cyc*', 'Cyc*']: + for glob_str in ["cyc*", "Cyc*"]: for pth in prefix.glob(glob_str): if pth.is_dir(): all_cycle_dirs.append(str(pth.stem).lower()) @@ -189,28 +181,26 @@ def collect_errors(self, **kwargs) -> List[str]: failures = [] # First cycle must be 1 if cycles[0] != 1: - failures.append('Cycle numbering does not start at 1') + failures.append("Cycle numbering does not start at 1") # First region must be 1 if regions[0] != 1: - failures.append('Region numbering does not start at 1') + failures.append("Region numbering does not start at 1") # Cycle range must be contiguous ints if cycles != list(range(cycles[0], cycles[-1] + 1)): - failures.append('Cycle numbers are not contiguous') + failures.append("Cycle numbers are not contiguous") # Region range must be contiguous ints if regions != list(range(regions[0], regions[-1] + 1)): - failures.append('Region numbers are not contiguous') + failures.append("Region numbers are not contiguous") # All cycle, region pairs must be present if len(cycles) * len(regions) != total_entries: - failures.append('Not all cycle/region pairs are present') + failures.append("Not all cycle/region pairs are present") # Total number of channels / total number of cycles must be integer, # excluding any HandE channels total_channel_count = len(cn_df) - h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith('HandE')]) - channels_per_cycle = ( - total_channel_count - h_and_e_channel_count - ) / len(cycles) + h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")]) + channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles) if channels_per_cycle != int(channels_per_cycle): - failures.append('The number of channels per cycle is not constant') + failures.append("The number of channels per cycle is not constant") if failures: rslt += failures raise QuitNowException() diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 1e9e9ba..41a6955 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -12,19 +12,19 @@ class CodexJsonValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: del kwargs - if 'codex' not in self.assay_type.lower(): + if "codex" not in self.assay_type.lower(): return [] - schema_path = Path(__file__).parent / 'codex_schema.json' + schema_path = Path(__file__).parent / "codex_schema.json" schema = json.loads(schema_path.read_text()) rslt = [] - for glob_expr in ['**/dataset.json']: + for glob_expr in ["**/dataset.json"]: for path in self.paths: for file in path.glob(glob_expr): instance = json.loads(file.read_text()) try: validate(instance=instance, schema=schema) except Exception as e: - rslt.append(f'{file}: {e}') + rslt.append(f"{file}: {e}") return rslt diff --git a/src/ingest_validation_tests/fastq_validator.py 
b/src/ingest_validation_tests/fastq_validator.py index 0a30e42..d31335f 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,8 +1,8 @@ from os import cpu_count from typing import List -from ingest_validation_tools.plugin_validator import Validator from fastq_validator_logic import FASTQValidatorLogic, _log +from ingest_validation_tools.plugin_validator import Validator class FASTQValidator(Validator): @@ -10,8 +10,8 @@ class FASTQValidator(Validator): cost = 15.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 - _log(f'Threading at {threads}') + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at {threads}") validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, threads) return validator.errors diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 85eeb44..8095b74 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -15,10 +15,7 @@ def is_valid_filename(filename: str) -> bool: def _open_fastq_file(file: Path) -> TextIO: - return ( - gzip.open(file, 'rt') if file.name.endswith('.gz') - else file.open() - ) + return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() def _log(message: str) -> str: @@ -54,13 +51,13 @@ class FASTQValidatorLogic: contain the same number of symbols as letters in the sequence. """ - _FASTQ_LINE_2_VALID_CHARS = 'ACGNT' + _FASTQ_LINE_2_VALID_CHARS = "ACGNT" def __init__(self, verbose=False): self.errors: List[str] = [] self._file_record_counts = Manager().dict() self._file_prefix_counts = Manager().dict() - self._filename = '' + self._filename = "" self._line_number = 0 self._verbose = verbose @@ -80,7 +77,7 @@ def _format_error(self, error: str) -> str: return message def _validate_fastq_line_1(self, line: str) -> List[str]: - if not line or line[0] != '@': + if not line or line[0] != "@": return ["Line does not begin with '@'."] return [] @@ -89,45 +86,47 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = ''.join( - c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) + invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] return [] def _validate_fastq_line_3(self, line: str) -> List[str]: - if not line or line[0] != '+': + if not line or line[0] != "+": return ["Line does not begin with '+'."] return [] def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] - invalid_chars = ''.join(c for c in line if not 33 <= ord(c) <= 126) + invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append("Line contains invalid quality character(s): " - f'"{invalid_chars}"') + errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"') if len(line) != self._line_2_length: - errors.append(f"Line contains {len(line)} characters which " - f"does not match line {self._last_line_2_number}'s " - f"{self._line_2_length} characters.") + errors.append( + f"Line contains {len(line)} characters which " + f"does not match line {self._last_line_2_number}'s " + f"{self._line_2_length} characters." 
+ ) return errors - _VALIDATE_FASTQ_LINE_METHODS = {1: _validate_fastq_line_1, - 2: _validate_fastq_line_2, - 3: _validate_fastq_line_3, - 4: _validate_fastq_line_4} + _VALIDATE_FASTQ_LINE_METHODS = { + 1: _validate_fastq_line_1, + 2: _validate_fastq_line_2, + 3: _validate_fastq_line_3, + 4: _validate_fastq_line_4, + } def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = \ - self._VALIDATE_FASTQ_LINE_METHODS[line_index] + validator_method: Callable[ + [FASTQValidatorLogic, str], List[str] + ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index] - assert validator_method, \ - f"No validator method defined for record index {line_index}" + assert validator_method, f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -139,8 +138,8 @@ def validate_fastq_stream(self, fastq_data: TextIO) -> int: for line_count, line in enumerate(fastq_data): self._line_number = line_count + 1 self.errors.extend( - self._format_error(error) for error in - self.validate_fastq_record(line.rstrip(), line_count) + self._format_error(error) + for error in self.validate_fastq_record(line.rstrip(), line_count) ) return line_count + 1 @@ -151,9 +150,9 @@ def validate_fastq_file(self, fastq_file: Path) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. - self.errors.append(_log( - "Filename does not have proper format " - "and will not be processed")) + self.errors.append( + _log("Filename does not have proper format " "and will not be processed") + ) return self._line_number = 0 @@ -163,12 +162,10 @@ def validate_fastq_file(self, fastq_file: Path) -> None: with _open_fastq_file(fastq_file) as fastq_data: records_read = self.validate_fastq_stream(fastq_data) except gzip.BadGzipFile: - self.errors.append( - self._format_error(f"Bad gzip file: {fastq_file}.")) + self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}.")) return except IOError: - self.errors.append( - self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) + self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return self._file_record_counts[str(fastq_file)] = records_read @@ -187,7 +184,9 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: for file in files: file_list.append(Path(path / rel_path / file)) try: - logging.info(f"Passing file list for paths {paths} to engine. File list: {file_list}.") + logging.info( + f"Passing file list for paths {paths} to engine. File list: {file_list}." + ) pool = Pool(threads) engine = Engine(self) data_output = pool.imap_unordered(engine, file_list) @@ -214,14 +213,17 @@ def _find_duplicates(self, dirs_and_files): files_per_path[filepath.name].append(data_path / sub_path) for filename, filepaths in files_per_path.items(): if len(filepaths) > 1: - self.errors.append(_log( - f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}.")) # noqa: E501 + self.errors.append( + _log( + f"{filename} has been found multiple times during this validation. Locations of duplicates: {filepaths}." # noqa: E501 + ) + ) def _find_shared_prefixes(self, lock): # This pattern seeks out the string that includes the lane number (since # that is expected to be present to help anchor the prefix) that comes # before any of _I1, _I2, _R1, or _R2. 
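# (Illustrative matches for the pattern below, filenames hypothetical:
#     "sample1_L001_R1_001.fastq.gz" -> prefix "sample1_L001"
#     "sample1_L001_I2_001.fastq.gz" -> prefix "sample1_L001"  (same group)
#     "sample1_R1.fastq.gz"          -> no match: no _L<lane> anchor
# files sharing a prefix must then have equal record counts.)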
- fastq_file_prefix_regex = re.compile(r'(.+_L\d+.*)_[IR][12][._]') + fastq_file_prefix_regex = re.compile(r"(.+_L\d+.*)_[IR][12][._]") for fastq_file, records_read in self._file_record_counts.items(): match = fastq_file_prefix_regex.match(Path(fastq_file).name) with lock: @@ -233,26 +235,31 @@ def _find_shared_prefixes(self, lock): # Find a file we've validated already that matches this # prefix. extant_files = [ - str(Path(filepath).name) for filepath, record_count - in self._file_record_counts.items() - if record_count == extant_count and Path(filepath).name.startswith(filename_prefix) + str(Path(filepath).name) + for filepath, record_count in self._file_record_counts.items() + if record_count == extant_count + and Path(filepath).name.startswith(filename_prefix) ] # Based on how the dictionaries are created, there should # always be at least one matching filename. assert extant_files - self.errors.append(_log( - f"{Path(fastq_file).name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines).")) + self.errors.append( + _log( + f"{Path(fastq_file).name} ({records_read} lines) " + f"does not match length of {extant_files[0]} " + f"({extant_count} lines)." + ) + ) else: self._file_prefix_counts[filename_prefix] = records_read def main(): - parser = argparse.ArgumentParser(description='Validate FASTQ files.') - parser.add_argument('filepaths', type=Path, nargs='+', - help="Files to validate for FASTQ syntax") + parser = argparse.ArgumentParser(description="Validate FASTQ files.") + parser.add_argument( + "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" + ) args = parser.parse_args() if isinstance(args.filepaths, List): @@ -262,13 +269,11 @@ def main(): elif isinstance(args.filepaths, str): filepaths = [Path(args.filepaths)] else: - raise Exception( - f"Validator init received base_paths arg as type {type(args.filepaths)}" - ) + raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}") validator = FASTQValidatorLogic(True) validator.validate_fastq_files_in_path(filepaths, Lock()) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index 58ad671..9bb0162 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,9 +1,9 @@ +import gzip +import re from multiprocessing import Pool from os import cpu_count -import re from typing import List -import gzip from ingest_validation_tools.plugin_validator import Validator diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py index 4c7ccc1..528a2b5 100644 --- a/src/ingest_validation_tests/publication_validator.py +++ b/src/ingest_validation_tests/publication_validator.py @@ -2,10 +2,11 @@ Test for some common errors in the directory and file structure of publications. """ -from typing import List -import re import json +import re from pathlib import Path +from typing import List + import frontmatter from ingest_validation_tools.plugin_validator import Validator @@ -15,50 +16,57 @@ class PublicationValidator(Validator): Test for some common errors in the directory and file structure of publications. 
""" + description = "Test for common problems found in publications" cost = 1.0 - base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)' - url_re = r'[Uu][Rr][Ll]' + base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)" + url_re = r"[Uu][Rr][Ll]" def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ del kwargs - if self.assay_type != 'Publication': + if self.assay_type != "Publication": return [] # We only test Publication data rslt = [] for path in self.paths: try: - vignette_path = path / 'vignettes' - assert vignette_path.is_dir(), 'vignettes not found or not a directory' - for this_vignette_path in vignette_path.glob('*'): - assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" - " in vignettes") - this_vignette_all_paths = set(this_vignette_path.glob('*')) + vignette_path = path / "vignettes" + assert vignette_path.is_dir(), "vignettes not found or not a directory" + for this_vignette_path in vignette_path.glob("*"): + assert this_vignette_path.is_dir(), ( + f"Found the non-dir {this_vignette_path}" " in vignettes" + ) + this_vignette_all_paths = set(this_vignette_path.glob("*")) if not all(pth.is_file() for pth in this_vignette_all_paths): - raise AssertionError('Found a subdirectory in a vignette') + raise AssertionError("Found a subdirectory in a vignette") md_found = False vig_figures = [] - for md_path in this_vignette_path.glob('*.md'): + for md_path in this_vignette_path.glob("*.md"): if md_found: - raise AssertionError('A vignette has more than one markdown file') + raise AssertionError("A vignette has more than one markdown file") else: md_found = True vig_fm = frontmatter.loads(md_path.read_text()) - for key in ['name', 'figures']: - assert key in vig_fm.metadata, ('vignette markdown is incorrectly' - f' formatted or has no {key}') - for fig_dict in vig_fm.metadata['figures']: - assert 'file' in fig_dict, 'figure dict does not reference a file' - assert 'name' in fig_dict, 'figure dict does not provide a name' - vig_figures.append(fig_dict['file']) + for key in ["name", "figures"]: + assert key in vig_fm.metadata, ( + "vignette markdown is incorrectly" f" formatted or has no {key}" + ) + for fig_dict in vig_fm.metadata["figures"]: + assert "file" in fig_dict, "figure dict does not reference a file" + assert "name" in fig_dict, "figure dict does not provide a name" + vig_figures.append(fig_dict["file"]) this_vignette_all_paths.remove(md_path) for fname in vig_figures: - rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path)) + rslt.extend( + self.validate_vitessce_config(this_vignette_path / fname, path) + ) this_vignette_all_paths.remove(this_vignette_path / fname) - assert not this_vignette_all_paths, ('unexpected files in vignette:' - f' {list(str(elt) for elt in this_vignette_all_paths)}') + assert not this_vignette_all_paths, ( + "unexpected files in vignette:" + f" {list(str(elt) for elt in this_vignette_all_paths)}" + ) except AssertionError as excp: rslt.append(str(excp)) @@ -94,9 +102,10 @@ def validate_vitessce_config(self, json_path, path): match = re.match(self.base_url_re, val) if match: # it starts with {{ base_url }} extra_url = match.group(2) - data_path = path / 'data' / extra_url - assert data_path.exists(), ("expected data file" - f" {Path('data') / extra_url} is absent") + data_path = path / "data" / extra_url + assert data_path.exists(), ( + "expected data file" f" {Path('data') / extra_url} is absent" + ) except AssertionError as excp: rslt.append(str(excp)) diff --git 
a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index db840c1..23b568a 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -37,13 +37,15 @@ class TiffValidator(Validator): cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] - for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: + for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: for path in self.paths: for file in path.glob(glob_expr): filenames_to_test.append(file) - return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, - filenames_to_test) - if rslt is not None) + return list( + rslt + for rslt in pool.imap_unordered(_check_tiff_file, filenames_to_test) + if rslt is not None + ) diff --git a/tests/pytest_runner.py b/tests/pytest_runner.py index 0743dad..ac7e5a3 100644 --- a/tests/pytest_runner.py +++ b/tests/pytest_runner.py @@ -1,12 +1,15 @@ import sys from pathlib import Path + import pytest -class add_path(): + +class add_path: """ Add an element to sys.path using a context. Thanks to Eugene Yarmash https://stackoverflow.com/a/39855753 """ + def __init__(self, path): self.path = path @@ -22,16 +25,13 @@ def __exit__(self, exc_type, exc_value, traceback): def main(): if len(sys.argv) != 2: - sys.exit(f'usage: {sys.argv[0]} path-to-ingest-validation-tools') - tools_path = Path(sys.argv[1]).resolve() / 'src' - plugins_path = (Path(__file__).resolve().parent.parent - / 'src' - / 'ingest_validation_tests' - ) + sys.exit(f"usage: {sys.argv[0]} path-to-ingest-validation-tools") + tools_path = Path(sys.argv[1]).resolve() / "src" + plugins_path = Path(__file__).resolve().parent.parent / "src" / "ingest_validation_tests" with add_path(str(tools_path)): with add_path(str(plugins_path)): - sys.exit(pytest.main(['-vv'])) + sys.exit(pytest.main(["-vv"])) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py index 23cd5e7..68d1d82 100644 --- a/tests/test_codex_common_errors_validator.py +++ b/tests/test_codex_common_errors_validator.py @@ -1,53 +1,84 @@ -from pathlib import Path import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_starts_list'), ( - ('test_data/fake_codex_tree_0.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_1.zip', ['The segmentation.json file is in', - 'Unexpected error reading']), - ('test_data/fake_codex_tree_2.zip', ['The raw/src_ subdirectory is missing?']), - ('test_data/fake_codex_tree_3.zip', ['channelnames.txt is missing']), - ('test_data/fake_codex_tree_4.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_5.zip', ['channelnames.txt does not match channelnames_report.txt' - ' on line 1: HLADR vs HLA-DR', - 'channelnames.txt does not match channelnames_report.txt' - ' on line 6: Empty vs Blank']), - ('test_data/fake_codex_tree_6.zip', ['Could not parse ']), - ('test_data/fake_codex_tree_7.zip', []), - ('test_data/fake_codex_tree_8.zip', ['Region numbers are not contiguous']), - ('test_data/fake_codex_tree_9.zip', ['Cycle numbers are not contiguous', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_10.zip', ['Directory string "cyc0a3_reg001_211119_040351"' - 
' cycle number is not an integer']), - ('test_data/fake_codex_tree_11.zip', ['Directory string "cyc003_reg0a1_211119_040351"' - ' region number is not an integer']), - ('test_data/fake_codex_tree_12.zip', ['Directory string "cyc002_rig001_211119_040351"' - ' does not include "_reg"']), - ('test_data/fake_codex_tree_13.zip', ['Cycle numbering does not start at 1']), - ('test_data/fake_codex_tree_14.zip', ['Region numbering does not start at 1']), - ('test_data/fake_codex_tree_15.zip', ['Not all cycle/region pairs are present', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_16.zip', []), - ('test_data/fake_codex_tree_17.zip', ['A dataset.json file exists but is in the wrong place', - 'Region numbering does not start at 1']), - ('test_data/fake_codex_tree_18.zip', ['The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_19.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_starts_list"), + ( + ("test_data/fake_codex_tree_0.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_1.zip", + ["The segmentation.json file is in", "Unexpected error reading"], + ), + ("test_data/fake_codex_tree_2.zip", ["The raw/src_ subdirectory is missing?"]), + ("test_data/fake_codex_tree_3.zip", ["channelnames.txt is missing"]), + ("test_data/fake_codex_tree_4.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_5.zip", + [ + "channelnames.txt does not match channelnames_report.txt" + " on line 1: HLADR vs HLA-DR", + "channelnames.txt does not match channelnames_report.txt" + " on line 6: Empty vs Blank", + ], + ), + ("test_data/fake_codex_tree_6.zip", ["Could not parse "]), + ("test_data/fake_codex_tree_7.zip", []), + ("test_data/fake_codex_tree_8.zip", ["Region numbers are not contiguous"]), + ( + "test_data/fake_codex_tree_9.zip", + [ + "Cycle numbers are not contiguous", + "The number of channels per cycle is not constant", + ], + ), + ( + "test_data/fake_codex_tree_10.zip", + ['Directory string "cyc0a3_reg001_211119_040351"' " cycle number is not an integer"], + ), + ( + "test_data/fake_codex_tree_11.zip", + ['Directory string "cyc003_reg0a1_211119_040351"' " region number is not an integer"], + ), + ( + "test_data/fake_codex_tree_12.zip", + ['Directory string "cyc002_rig001_211119_040351"' ' does not include "_reg"'], + ), + ("test_data/fake_codex_tree_13.zip", ["Cycle numbering does not start at 1"]), + ("test_data/fake_codex_tree_14.zip", ["Region numbering does not start at 1"]), + ( + "test_data/fake_codex_tree_15.zip", + [ + "Not all cycle/region pairs are present", + "The number of channels per cycle is not constant", + ], + ), + ("test_data/fake_codex_tree_16.zip", []), + ( + "test_data/fake_codex_tree_17.zip", + [ + "A dataset.json file exists but is in the wrong place", + "Region numbering does not start at 1", + ], + ), + ("test_data/fake_codex_tree_18.zip", ["The number of channels per cycle is not constant"]), + ("test_data/fake_codex_tree_19.zip", []), + ), +) def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_path): from codex_common_errors_validator import CodexCommonErrorsValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], - 'CODEX' - ) + validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], "CODEX") errors = validator.collect_errors()[:] - print(f'ERRORS FOLLOW FOR 
{test_data_fname}') + print(f"ERRORS FOLLOW FOR {test_data_fname}") for err in errors: print(err) - print('ERRORS ABOVE') + print("ERRORS ABOVE") assert len(msg_starts_list) == len(errors) for err_str, expected_str in zip(errors, msg_starts_list): assert err_str.startswith(expected_str) diff --git a/tests/test_codex_json_validator.py b/tests/test_codex_json_validator.py index a2998ff..4539270 100644 --- a/tests/test_codex_json_validator.py +++ b/tests/test_codex_json_validator.py @@ -1,22 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip', - [".*is not of type 'object'.*"]), - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip", + [".*is not of type 'object'.*"], + ), + ("test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip", []), + ), +) def test_codex_json_validator(test_data_fname, msg_re_list, tmp_path): from codex_json_validator import CodexJsonValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexJsonValidator(tmp_path / test_data_path.stem, - 'CODEX' - ) + validator = CodexJsonValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors()[:] assert len(msg_re_list) == len(errors) for err_str, expected_re in zip(errors, msg_re_list): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index a244cd8..c554d6f 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -1,36 +1,30 @@ -from multiprocessing import Lock +import gzip from pathlib import Path from typing import TextIO -import gzip import pytest +from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic -from src.ingest_validation_tests.fastq_validator_logic import \ - FASTQValidatorLogic - -_GOOD_RECORDS = '''\ +_GOOD_RECORDS = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFFF -''' +""" _GOOD_QUALITY_RECORD = ( - '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ' - r'[\]^_`abcdefghijklmnopqrstuvwxyz{|}~' + "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + r"[\]^_`abcdefghijklmnopqrstuvwxyz{|}~" ) _GOOD_SEQUENCE_FOR_QUALITY = ( - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" ) def _open_output_file(filename: Path, use_gzip: bool) -> TextIO: - return ( - gzip.open(filename, 'wt') if use_gzip - else open(filename, 'wt') - ) + return gzip.open(filename, "wt") if use_gzip else open(filename, "wt") class TestFASTQValidatorLogic: @@ -46,33 +40,29 @@ def test_fastq_validator_no_files(self, fastq_validator, tmp_path): def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): # Note that the filename ends in .gz, although it will not contain # compressed data. 
- test_file = tmp_path.joinpath('test.fastq.gz') + test_file = tmp_path.joinpath("test.fastq.gz") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) assert "Bad gzip file" in fastq_validator.errors[0] - def test_fastq_validator_unrecognized_file(self, fastq_validator, - tmp_path): - test_file = tmp_path.joinpath('test.txt') + def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path): + test_file = tmp_path.joinpath("test.txt") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) - assert "Filename does not have proper format" in \ - fastq_validator.errors[0] + assert "Filename does not have proper format" in fastq_validator.errors[0] - def test_fastq_validator_empty_directory(self, fastq_validator, - tmp_path): + def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path): fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # No files in path means no errors assert fastq_validator.errors == [] @pytest.mark.parametrize("use_gzip", [False, True]) def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): - test_file = tmp_path.joinpath('test.fastq.gz' if use_gzip - else 'test.fastq') + test_file = tmp_path.joinpath("test.fastq.gz" if use_gzip else "test.fastq") with _open_output_file(test_file, use_gzip) as output: output.write(_GOOD_RECORDS) @@ -80,9 +70,9 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): assert not fastq_validator.errors def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): - test_file = tmp_path.joinpath('test.fastq') + test_file = tmp_path.joinpath("test.fastq") with _open_output_file(test_file, False) as output: - output.write('ABCDEF') + output.write("ABCDEF") fastq_validator.validate_fastq_files_in_path([tmp_path], 2) @@ -91,56 +81,54 @@ def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): assert fastq_validator.errors def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): - for subdirectory in ['a', 'b']: + for subdirectory in ["a", "b"]: subdirectory_path = tmp_path.joinpath(subdirectory) subdirectory_path.mkdir() - with _open_output_file(subdirectory_path.joinpath('test.fastq'), - False) as output: + with _open_output_file(subdirectory_path.joinpath("test.fastq"), False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - assert "test.fastq has been found multiple times" in \ - fastq_validator.errors[0] + assert "test.fastq has been found multiple times" in fastq_validator.errors[0] def test_fastq_validator_io_error(self, fastq_validator, tmp_path): - fake_path = tmp_path.joinpath('does-not-exist.fastq') + fake_path = tmp_path.joinpath("does-not-exist.fastq") fastq_validator.validate_fastq_file(fake_path) assert "Unable to open" in fastq_validator.errors[0] def test_fastq_validator_line_1_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('@SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("@SEQ_ID", 0) assert not result def test_fastq_validator_line_1_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('*SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("*SEQ_ID", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_1_empty(self, fastq_validator): - result = fastq_validator.validate_fastq_record('', 0) + result = 
fastq_validator.validate_fastq_record("", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_2_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACTGACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACTGACTGNNNN", 1) assert not result def test_fastq_validator_line_2_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACT$ACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACT$ACTGNNNN", 1) assert "contains invalid character(s): $" in result[0] def test_fastq_validator_line_3_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('+SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("+SEQ_ID", 2) assert not result def test_fastq_validator_line_3_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('!SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("!SEQ_ID", 2) assert "does not begin with '+'" in result[0] @@ -151,42 +139,41 @@ def test_fastq_validator_line_4_good(self, fastq_validator): assert not result def test_fastq_validator_line_4_bad(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABC !@#', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABC !@#", 3) assert 'contains invalid quality character(s): " "' in result[0] def test_fastq_validator_line_4_matching_length(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABCDEFG', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABCDEFG", 3) assert not result - def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, - tmp_path): - fastq_validator.validate_fastq_record('123456789ABCDEF', 1) - fastq_validator.validate_fastq_record('ABC', 3) + def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, tmp_path): + fastq_validator.validate_fastq_record("123456789ABCDEF", 1) + fastq_validator.validate_fastq_record("ABC", 3) - test_data = '''\ + test_data = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFF -''' +""" - new_file = tmp_path.joinpath('test.fastq') + new_file = tmp_path.joinpath("test.fastq") with _open_output_file(new_file, False) as output: output.write(test_data) fastq_validator.validate_fastq_file(new_file) - assert "contains 9 characters which does not match line 2's 10" in \ - fastq_validator.errors[0] + assert ( + "contains 9 characters which does not match line 2's 10" in fastq_validator.errors[0] + ) - def test_fastq_validator_record_counts_good(self, fastq_validator, - tmp_path): + def test_fastq_validator_record_counts_good(self, fastq_validator, tmp_path): for filename in [ - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq', - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq' + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq", + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq", ]: new_file = tmp_path.joinpath(filename) with _open_output_file(new_file, False) as output: @@ -196,15 +183,14 @@ def test_fastq_validator_record_counts_good(self, fastq_validator, assert not fastq_validator.errors - def test_fastq_validator_record_counts_bad(self, fastq_validator, - tmp_path): - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq'), - 
False) as output: + def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path): + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq'), - False) as output: + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) output.write(_GOOD_RECORDS) diff --git a/tests/test_gz_validator.py b/tests/test_gz_validator.py index 0c6c8b2..ad748c4 100644 --- a/tests/test_gz_validator.py +++ b/tests/test_gz_validator.py @@ -1,19 +1,24 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/fake_snrnaseq_tree_good.zip', []), - ('test_data/fake_snrnaseq_tree_bad.zip', ['.*text2.txt.gz is not a valid gzipped file']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/fake_snrnaseq_tree_good.zip", []), + ("test_data/fake_snrnaseq_tree_bad.zip", [".*text2.txt.gz is not a valid gzipped file"]), + ), +) def test_gz_validator(test_data_fname, msg_re_list, tmp_path): from gz_validator import GZValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = GZValidator(tmp_path / test_data_path.stem, 'snRNAseq') + validator = GZValidator(tmp_path / test_data_path.stem, "snRNAseq") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_ome_tiff_validator.py b/tests/test_ome_tiff_validator.py index 09e2163..89ad198 100644 --- a/tests/test_ome_tiff_validator.py +++ b/tests/test_ome_tiff_validator.py @@ -1,20 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/codex_tree_ometiff_bad.zip', - ['.*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*']), - ('test_data/codex_tree_ometiff_good.zip',[]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/codex_tree_ometiff_bad.zip", + [".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*"], + ), + ("test_data/codex_tree_ometiff_good.zip", []), + ), +) def test_ome_tiff_validator(test_data_fname, msg_re_list, tmp_path): from ome_tiff_validator import OmeTiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = OmeTiffValidator(tmp_path / test_data_path.stem, 'CODEX') + validator = OmeTiffValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_publication_validator.py b/tests/test_publication_validator.py index ebb768c..50c3a0e 100644 --- a/tests/test_publication_validator.py +++ b/tests/test_publication_validator.py @@ -1,34 +1,46 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/publication_tree_good.zip', []), - ('test_data/publication_tree_good_complex.zip', []), - 
('test_data/publication_tree_bad_complex.zip', - [ - 'expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent', - 'expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent', - 'expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent', - ]), - ('test_data/publication_tree_bad_1.zip', ['vignettes not found or not a directory']), - ('test_data/publication_tree_bad_2.zip', ['Found a subdirectory in a vignette']), - ('test_data/publication_tree_bad_3.zip', ['A vignette has more than one markdown file']), - ('test_data/publication_tree_bad_4.zip', ['figure dict does not provide a name']), - ('test_data/publication_tree_bad_5.zip', ['figure dict does not reference a file']), - ('test_data/publication_tree_bad_6.zip', ['unexpected files in vignette.*']), - ('test_data/publication_tree_bad_7.zip', ['expected data file' - ' data/codeluppi_2018_nature_methods.molecules.h5ad.zarr' - ' is absent']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/publication_tree_good.zip", []), + ("test_data/publication_tree_good_complex.zip", []), + ( + "test_data/publication_tree_bad_complex.zip", + [ + "expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent", # noqa: E501 + "expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent", # noqa: E501 + "expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent", # noqa: E501 + ], + ), + ("test_data/publication_tree_bad_1.zip", ["vignettes not found or not a directory"]), + ("test_data/publication_tree_bad_2.zip", ["Found a subdirectory in a vignette"]), + ("test_data/publication_tree_bad_3.zip", ["A vignette has more than one markdown file"]), + ("test_data/publication_tree_bad_4.zip", ["figure dict does not provide a name"]), + ("test_data/publication_tree_bad_5.zip", ["figure dict does not reference a file"]), + ("test_data/publication_tree_bad_6.zip", ["unexpected files in vignette.*"]), + ( + "test_data/publication_tree_bad_7.zip", + [ + "expected data file" + " data/codeluppi_2018_nature_methods.molecules.h5ad.zarr" + " is absent" + ], + ), + ), +) def test_publication_validator(test_data_fname, msg_re_list, tmp_path): from publication_validator import PublicationValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = PublicationValidator(tmp_path / test_data_path.stem, 'Publication') + validator = PublicationValidator(tmp_path / test_data_path.stem, "Publication") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = [] diff --git a/tests/test_tiff_validator.py b/tests/test_tiff_validator.py index c2d271b..4ebe66c 100644 --- a/tests/test_tiff_validator.py +++ b/tests/test_tiff_validator.py @@ -1,24 +1,32 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/tiff_tree_good.zip', []), - ('test_data/tiff_tree_bad.zip', [ - '.*notatiff.tif is not a valid TIFF file', - '.*notatiff.tiff is not a valid TIFF file', - '.*notatiff.TIFF is not a valid TIFF file', - '.*notatiff.TIF is not a valid TIFF file', - ]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/tiff_tree_good.zip", []), + ( + "test_data/tiff_tree_bad.zip", + [ + ".*notatiff.tif is not a valid TIFF 
file", + ".*notatiff.tiff is not a valid TIFF file", + ".*notatiff.TIFF is not a valid TIFF file", + ".*notatiff.TIF is not a valid TIFF file", + ], + ), + ), +) def test_tiff_validator(test_data_fname, msg_re_list, tmp_path): from tiff_validator import TiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = TiffValidator(tmp_path / test_data_path.stem, 'codex') + validator = TiffValidator(tmp_path / test_data_path.stem, "codex") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = [] From 174aae05eca907b1f24589040b2a5e300838c704 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 1 Feb 2024 12:40:50 -0500 Subject: [PATCH 19/30] testing black gh action --- .github/workflows/black.yml | 14 +++++++++++++ README.md | 42 +++++++++++++++++++++++++++++++++++-- requirements-dev.txt | 2 +- 3 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..a4a0460 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,14 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: psf/black@stable + with: + options: "--check --verbose --line-length 99" + src: "./src" + version: "~= 24.0" diff --git a/README.md b/README.md index 2db444a..033f5b4 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,47 @@ # ingest-validation-tests -This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. +This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. ## Development process +### Branches + - Make new feature branches from `devel`. - Make PRs to `devel`. (This is the default branch.) -- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling . +- The last reviewer to approve a PR should merge it. + +### Setup + +- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv: + +``` +python3.9 -m venv hm-ingest-validation-tests +source hm-ingest-validation-tests/bin/activate +``` + +- Run `pip install -r requirements-dev.txt` +- (optional) Integrate black with your editor. + - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html) + - If you choose not to integrate black with your editor, run the following from the base `ingest-validation-tests` directory before pushing code to GitHub: `black --line-length 99 .` + +### Testing + +- If ingest-validation-tools is not already set up: + +``` +# Starting from ingest-validation-tests... +cd .. 
+git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
+cd ingest-validation-tests
+pip install -r ../ingest-validation-tools/requirements.txt
+pip install -r ../ingest-validation-tools/requirements-dev.txt
+```
+
+- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run
+
+```
+pip install -r <path-to-ingest-validation-tools>/requirements.txt
+pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
+```
+
+- Run `test.sh`
diff --git a/requirements-dev.txt b/requirements-dev.txt
index d5c087a..2cd9d49 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,10 +1,10 @@
 black==23.12.1
-flake8==7.0.0
 git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils
 imagecodecs>=2023.3.16
 isort==5.13.2
 jsonschema==4.4.0
 pandas>=1.2.0
+pytest==8.0.0
 python-frontmatter>=1.0.0
 tifffile==2020.10.1
 xmlschema>=1.6

From 0dd0cb643a841301b7f1acea6e7c1891bfa70af6 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 12:53:29 -0500
Subject: [PATCH 20/30] linting change

---
 src/ingest_validation_tests/fastq_validator_logic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py
index 8095b74..440ceb4 100644
--- a/src/ingest_validation_tests/fastq_validator_logic.py
+++ b/src/ingest_validation_tests/fastq_validator_logic.py
@@ -122,9 +122,9 @@ def _validate_fastq_line_4(self, line: str) -> List[str]:
 
     def validate_fastq_record(self, line: str, line_number: int) -> List[str]:
         line_index = line_number % 4 + 1
 
-        validator_method: Callable[
-            [FASTQValidatorLogic, str], List[str]
-        ] = self._VALIDATE_FASTQ_LINE_METHODS[line_index]
+        validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = (
+            self._VALIDATE_FASTQ_LINE_METHODS[line_index]
+        )
 
         assert validator_method, f"No validator method defined for record index {line_index}"

From 303e87d8179cceb5b310d0faa64f76726b213d69 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:01:40 -0500
Subject: [PATCH 21/30] testing update to dismiss error about node version

---
 .github/workflows/black.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index a4a0460..d8d7e0b 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -6,7 +6,7 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: psf/black@stable
         with:
           options: "--check --verbose --line-length 99"

From edac78ced52c81068baa4504558d7bdd9e5eeeed Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:06:05 -0500
Subject: [PATCH 22/30] trying isort gh action

---
 .github/workflows/isort.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 .github/workflows/isort.yml

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
new file mode 100644
index 0000000..7bfa84f
--- /dev/null
+++ b/.github/workflows/isort.yml
@@ -0,0 +1,13 @@
+name: Run isort
+on:
+  - push
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: isort/isort-action@v1
+        with:
+          configuration: "--profile black"
+          requirements-files: "requirements.txt requirements-dev.txt"
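Patch 20 above only reflows the type annotation in `validate_fastq_record`, but the dispatch that annotation describes is easy to miss in diff form: FASTQ records are four lines long, and `line_number % 4 + 1` turns a zero-based line counter into a key for `_VALIDATE_FASTQ_LINE_METHODS`. A minimal standalone sketch of that mapping (the rule strings below paraphrase the checks exercised by the tests in this series; they are not copied from the module):

```
# Zero-based line numbers cycle through the four positions of a FASTQ record.
RULES = {
    1: "header: must begin with '@'",
    2: "sequence: no invalid characters",
    3: "separator: must begin with '+'",
    4: "quality: length must match the sequence line",
}

for line_number in range(8):  # two consecutive 4-line records
    line_index = line_number % 4 + 1
    print(f"line {line_number} -> rule {line_index}: {RULES[line_index]}")
```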
From 9d368d12c070eddb93dc35d96786b4a7740edd3f Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:49:30 -0500
Subject: [PATCH 23/30] testing isort 3

---
 .github/workflows/isort.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index 7bfa84f..fdf5e4a 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -7,7 +7,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: isort/isort-action@v1
+      - uses: actions/setup-python@v5
         with:
-          configuration: "--profile black"
-          requirements-files: "requirements.txt requirements-dev.txt"
+          python-version: 3.9
+      - uses: isort/isort-action@master
+        with:
+          configuration: "--profile black"
+          requirementsFiles: "requirements.txt requirements-test.txt"

From 2f6f1d9cbc2070e63565625537acf7f013b97f2c Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 13:51:03 -0500
Subject: [PATCH 24/30] testing isort 4

---
 .github/workflows/isort.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index fdf5e4a..52f9cf5 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -12,5 +12,5 @@ jobs:
           python-version: 3.9
       - uses: isort/isort-action@master
         with:
-          configuration: "--profile black"
+          configuration: "--check-only --diff --profile black"
           requirementsFiles: "requirements.txt requirements-test.txt"

From 4f9c09fb12e3cd645a5aa8c925c0106bcacdcdf4 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:10:23 -0500
Subject: [PATCH 25/30] updated actions to run on PR; updated README.md

---
 .github/workflows/black.yml | 3 +--
 .github/workflows/isort.yml | 3 +--
 README.md                   | 5 ++++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index d8d7e0b..7c02822 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -1,6 +1,5 @@
-name: Lint
-
-on: [push, pull_request]
+name: Lint
+on: pull_request
 
 jobs:
   lint:
diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index 52f9cf5..db105a8 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -1,6 +1,5 @@
-name: Run isort
-on:
-  - push
+name: Run isort
+on: pull_request
 
 jobs:
   build:
diff --git a/README.md b/README.md
index 033f5b4..4bddaf1 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,10 @@ This repository contains plug-in tests for use during validation of submissions.
 ### Branches
 
 - Make new feature branches from `devel`.
+- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+  - `black --line-length 99 .` (if you choose not to integrate black with your editor; see the Setup section)
+  - `isort .`
+
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
 
@@ -22,7 +26,6 @@ source hm-ingest-validation-tests/bin/activate
 
 - Run `pip install -r requirements-dev.txt`
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
-  - If you choose not to integrate black with your editor, run the following from the base `ingest-validation-tests` directory before pushing code to GitHub: `black --line-length 99 .`
 
 ### Testing
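The README additions in patch 25 tell contributors to run `black --line-length 99 .` by hand. For anyone who wants to see what that check enforces on a single snippet, black also exposes a small Python API; the following is a rough local sketch under the assumption that black is installed, not code from this repository:

```
import black

source = "def f( a,b ):\n    return {'x':a ,'y':b}\n"
mode = black.Mode(line_length=99)  # mirrors --line-length 99 in the workflow

formatted = black.format_str(source, mode=mode)
if formatted != source:
    # Equivalent to a --check failure in the Lint workflow.
    print("would reformat:")
    print(formatted)
```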
From a029b0061d251cba10393e0be7843e2a86fc9a00 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:35:56 -0500
Subject: [PATCH 26/30] cleanup

---
 .flake8    |  3 ---
 .gitignore |  1 +
 README.md  | 48 +++++++++++++++++++++++++++---------------------
 3 files changed, 28 insertions(+), 24 deletions(-)
 delete mode 100644 .flake8

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index ad98539..0000000
--- a/.flake8
+++ /dev/null
@@ -1,3 +0,0 @@
-[flake8]
-max-line-length = 100
-extend-ignore = E203
diff --git a/.gitignore b/.gitignore
index da9fde3..79cb0cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .vscode/
+.flake8
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 4bddaf1..b0f37ab 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,13 @@ This repository contains plug-in tests for use during validation of submissions.
 
 - Make new feature branches from `devel`.
-- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
-  - `black --line-length 99 .` (if you choose not to integrate black with your editor; see the Setup section)
-  - `isort .`
+- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+
+  ```
+  black --line-length 99 .
+  isort --profile black --multi-line 3 .
+  ```
+
+  (You can integrate black and potentially isort with your editor to skip this step; see the Setup section below.)
 
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
@@ -18,33 +23,34 @@ This repository contains plug-in tests for use during validation of submissions.
 
 - Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:
 
-```
-python3.9 -m venv hm-ingest-validation-tests
-source hm-ingest-validation-tests/bin/activate
-```
+  ```
+  python3.9 -m venv hm-ingest-validation-tests
+  source hm-ingest-validation-tests/bin/activate
+  ```
 
 - Run `pip install -r requirements-dev.txt`
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
+- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
 
 ### Testing
 
 - If ingest-validation-tools is not already set up:
 
-```
-# Starting from ingest-validation-tests...
-cd ..
-git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
-cd ingest-validation-tests
-pip install -r ../ingest-validation-tools/requirements.txt
-pip install -r ../ingest-validation-tools/requirements-dev.txt
-```
-
-- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run
-
-```
-pip install -r <path-to-ingest-validation-tools>/requirements.txt
-pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
-```
+  ```
+  # Starting from ingest-validation-tests...
+  cd ..
+  git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
+  cd ingest-validation-tests
+  pip install -r ../ingest-validation-tools/requirements.txt
+  pip install -r ../ingest-validation-tools/requirements-dev.txt
+  ```
+
+- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run:
+
+  ```
+  pip install -r <path-to-ingest-validation-tools>/requirements.txt
+  pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
+  ```
 
 - Run `test.sh`
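Patch 26's README settles on `isort --profile black --multi-line 3 .`, and patch 27 below trims the matching `[tool.isort]` table in `pyproject.toml`. The point of `profile = "black"` (which implies `multi_line_output = 3`, vertical hanging indent) is to keep isort from fighting black over import layout. An illustrative sketch using isort's Python API, assuming isort 5 is installed:

```
import isort

messy = "from typing import TextIO, List, Callable\nimport gzip\nimport re\n"

# profile="black" applies black-compatible settings, including
# multi_line_output=3, matching the pyproject.toml configuration.
print(isort.code(messy, profile="black", line_length=99))
```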
From 9604ce7a38b7a20ebfa37c93c95bffbdca0ed045 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:42:24 -0500
Subject: [PATCH 27/30] cleanup

---
 .github/workflows/isort.yml | 4 ++--
 pyproject.toml              | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
index db105a8..2ef4557 100644
--- a/.github/workflows/isort.yml
+++ b/.github/workflows/isort.yml
@@ -11,5 +11,5 @@ jobs:
           python-version: 3.9
       - uses: isort/isort-action@master
         with:
-          configuration: "--check-only --diff --profile black"
-          requirementsFiles: "requirements.txt requirements-test.txt"
+          configuration: "--check-only --diff --profile black"
+          requirementsFiles: "requirements.txt requirements-test.txt"
diff --git a/pyproject.toml b/pyproject.toml
index 72dd8d9..93863fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,4 +4,3 @@
 [tool.black]
 line-length = 99
 [tool.isort]
 profile = "black"
 multi_line_output = 3
-src_paths = ["src/ingest-validation-tests"]

From e30f4263b8b9c44c98262fc2f8fce199b209b33c Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Thu, 1 Feb 2024 14:45:11 -0500
Subject: [PATCH 28/30] oops forgot to update test file

---
 tests/test_fastq_validator_logic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py
index c554d6f..31bd68a 100644
--- a/tests/test_fastq_validator_logic.py
+++ b/tests/test_fastq_validator_logic.py
@@ -3,6 +3,7 @@
 from typing import TextIO
 
 import pytest
+
 from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic
 
 _GOOD_RECORDS = """\
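The next patch recreates `.flake8` with `extend-ignore = E203,E501,W503`. E501 is redundant once black enforces the 99-character limit, and E203/W503 are the two pycodestyle rules that black's own output routinely triggers, so ignoring them is the standard black-compatible flake8 setup. A small illustrative sketch of the conflicts (hypothetical code, not from this repository):

```
data, offset = list(range(20)), 2

# black formats a slice with compound bounds with spaces around the colon,
# which pycodestyle flags as E203 (whitespace before ':').
chunk = data[offset + 1 : offset + 5]

# black's style also breaks long expressions *before* binary operators,
# which pycodestyle flags as W503 (line break before binary operator).
total = (
    sum(chunk)
    + offset
    + len(data)
)
```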
From 99858a1dfca86648c83b1f4b9241bd11d59c8b5d Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Fri, 2 Feb 2024 13:46:17 -0500
Subject: [PATCH 29/30] simplifying workflows yml file, adding flake8

---
 .flake8                       |  3 +++
 .github/workflows/black.yml   | 13 -------------
 .github/workflows/isort.yml   | 15 ---------------
 .github/workflows/linting.yml | 32 ++++++++++++++++++++++++++++++++
 .gitignore                    |  1 -
 README.md                     |  6 ++++--
 requirements-dev.txt          |  1 +
 7 files changed, 40 insertions(+), 31 deletions(-)
 create mode 100644 .flake8
 delete mode 100644 .github/workflows/black.yml
 delete mode 100644 .github/workflows/isort.yml
 create mode 100644 .github/workflows/linting.yml

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..70fc30a
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+extend-ignore = E203,E501,W503
+max-line-length = 99
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
deleted file mode 100644
index 7c02822..0000000
--- a/.github/workflows/black.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: Lint
-on: pull_request
-
-jobs:
-  lint:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: psf/black@stable
-        with:
-          options: "--check --verbose --line-length 99"
-          src: "./src"
-          version: "~= 24.0"
diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
deleted file mode 100644
index 2ef4557..0000000
--- a/.github/workflows/isort.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-name: Run isort
-on: pull_request
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-      - uses: isort/isort-action@master
-        with:
-          configuration: "--check-only --diff --profile black"
-          requirementsFiles: "requirements.txt requirements-test.txt"
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
new file mode 100644
index 0000000..f34ebe8
--- /dev/null
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,32 @@
+name: Linters
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 black isort
+      - name: Flake8 Lint
+        run: |
+          flake8 --ignore=E501,W503,E203
+      - name: Black Lint
+        run: |
+          black --line-length 99 --check --verbose
+      - name: isort Lint
+        run: |
+          isort --profile black --check-only --diff
diff --git a/.gitignore b/.gitignore
index 79cb0cf..da9fde3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 .vscode/
-.flake8
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index b0f37ab..0cca7ba 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,15 @@ This repository contains plug-in tests for use during validation of submissions.
 ### Branches
 
 - Make new feature branches from `devel`.
-- Before submitting a PR, make sure your code is black and isort compliant. Run the following from the base `ingest-validation-tests` directory:
+- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:
 
   ```
   black --line-length 99 .
   isort --profile black --multi-line 3 .
+  flake8
   ```
 
-  (You can integrate black and potentially isort with your editor to skip this step; see the Setup section below.)
+  (Integrating black and potentially isort/flake8 with your editor may allow you to skip this step; see the Setup section below.)
 
 - Make PRs to `devel`. (This is the default branch.)
 - The last reviewer to approve a PR should merge it.
@@ -32,6 +33,7 @@ This repository contains plug-in tests for use during validation of submissions.
 - (optional) Integrate black with your editor.
   - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
 - (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
+- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.
### Testing diff --git a/requirements-dev.txt b/requirements-dev.txt index 2cd9d49..ba6f71b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ black==23.12.1 +flake8==7.0.0 git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils imagecodecs>=2023.3.16 isort==5.13.2 From 67436d4e59399b25fce5d76d159edc85a42b9ca2 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Fri, 2 Feb 2024 13:47:38 -0500 Subject: [PATCH 30/30] fixing mistake --- .github/workflows/linting.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index f34ebe8..b4fb9d4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -23,10 +23,10 @@ jobs: pip install flake8 black isort - name: Flake8 Lint run: | - flake8 --ignore=E501,W503,E203 + flake8 --ignore=E501,W503,E203 . - name: Black Lint run: | - black --line-length 99 --check --verbose + black --line-length 99 --check --verbose . - name: isort Lint run: | - isort --profile black --check-only --diff + isort --profile black --check-only --diff .
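With patch 30 applied, the workflow's three lint steps and the README's local commands finally match. As a convenience, the whole gate can be reproduced locally with something like the following sketch (assuming flake8, black, and isort are installed in the active environment; this script is not part of the repository):

```
import subprocess

# The same three checks linting.yml runs once PATCH 30 adds the missing
# "." targets; check=True makes the first lint failure raise an error.
for cmd in (
    ["flake8", "--ignore=E501,W503,E203", "."],
    ["black", "--line-length", "99", "--check", "--verbose", "."],
    ["isort", "--profile", "black", "--check-only", "--diff", "."],
):
    subprocess.run(cmd, check=True)
```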