diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..70fc30a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +extend-ignore = E203,E501,W503 +max-line-length = 99 diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 0000000..b4fb9d4 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,32 @@ +name: Linters + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 black isort + - name: Flake8 Lint + run: | + flake8 --ignore=E501,W503,E203 . + - name: Black Lint + run: | + black --line-length 99 --check --verbose . + - name: isort Lint + run: | + isort --profile black --check-only --diff . diff --git a/.gitignore b/.gitignore index 916db7b..da9fde3 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,7 @@ celerybeat.pid # Environments .env +.envrc .venv env/ venv/ diff --git a/README.md b/README.md index 2db444a..0cca7ba 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,58 @@ # ingest-validation-tests -This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. +This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools. ## Development process +### Branches + - Make new feature branches from `devel`. +- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory: + + ``` + black --line-length 99 . + isort --profile black --multi-line 3 . 
+ flake8 + ``` + + (Integrating black and potentially isort/flake8 with your editor may allow you to skip this step, see Setup section below.) + - Make PRs to `devel`. (This is the default branch.) -- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling . +- The last reviewer to approve a PR should merge it. + +### Setup + +- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv: + + ``` + python3.9 -m venv hm-ingest-validation-tests + source hm-ingest-validation-tests/bin/activate + ``` + +- Run `pip install -r requirements-dev.txt` +- (optional) Integrate black with your editor. + - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html) +- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor. +- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor. + +### Testing + +- If ingest-validation-tools is not already set up: + + ``` + # Starting from ingest-validation-tests... + cd .. 
+ git clone https://github.com/hubmapconsortium/ingest-validation-tools.git + cd ingest-validation-tests + pip install -r ../ingest-validation-tools/requirements.txt + pip install -r ../ingest-validation-tools/requirements-dev.txt + ``` + +- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run: + + ``` + pip install -r path/to/ingest-validation-tools/requirements.txt + pip install -r path/to/ingest-validation-tools/requirements-dev.txt + ``` + +- Run `test.sh` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..93863fa --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[tool.black] +line-length = 99 + +[tool.isort] +profile = "black" +multi_line_output = 3 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ba6f71b --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,11 @@ +black==23.12.1 +flake8==7.0.0 +git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils +imagecodecs>=2023.3.16 +isort==5.13.2 +jsonschema==4.4.0 +pandas>=1.2.0 +pytest==8.0.0 +python-frontmatter>=1.0.0 +tifffile==2020.10.1 +xmlschema>=1.6 diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py index 95bdad2..651b5b9 100644 --- a/src/ingest_validation_tests/codex_common_errors_validator.py +++ b/src/ingest_validation_tests/codex_common_errors_validator.py @@ -28,18 +28,12 @@ def _split_cycle_dir_string(cycle_str): try: cyc_id = int(words[0][len("cyc") :]) except ValueError: - raise AssertionError( - f'Directory string "{cycle_str}" cycle number is' " not an integer" - ) - assert words[1].startswith("reg"), ( - f'Directory string "{cycle_str}" does' ' not include "_reg"' - ) + raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer") + assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"' try: reg_id = int(words[1][len("reg") :]) except ValueError: - raise 
AssertionError( - f'Directory string "{cycle_str}" region number is' " not an integer" - ) + raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer") return cyc_id, reg_id @@ -86,9 +80,7 @@ def collect_errors(self, **kwargs) -> List[str]: self._log("FOUND dataset.json; skipping further analysis") raise QuitNowException() elif any_dataset_json_exists: - rslt.append( - "A dataset.json file exists but" " is in the wrong place" - ) + rslt.append("A dataset.json file exists but" " is in the wrong place") # is the segmentation.json file on the right side? found = False @@ -102,9 +94,7 @@ def collect_errors(self, **kwargs) -> List[str]: if right_place: pass else: - rslt.append( - "The segmentation.json file is in the wrong subdirectory" - ) + rslt.append("The segmentation.json file is in the wrong subdirectory") else: rslt.append("The segmentation.json file is missing or misplaced") @@ -145,8 +135,7 @@ def collect_errors(self, **kwargs) -> List[str]: raise QuitNowException() if len(rpt_df.columns) != 2: rslt.append( - f"Could not parse {report_csv_path}." - " Is it a comma-separated table?" + f"Could not parse {report_csv_path}." " Is it a comma-separated table?" 
) raise QuitNowException() col_0, col_1 = rpt_df.columns @@ -209,9 +198,7 @@ def collect_errors(self, **kwargs) -> List[str]: # excluding any HandE channels total_channel_count = len(cn_df) h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")]) - channels_per_cycle = ( - total_channel_count - h_and_e_channel_count - ) / len(cycles) + channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles) if channels_per_cycle != int(channels_per_cycle): failures.append("The number of channels per cycle is not constant") if failures: diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py index 1e9e9ba..41a6955 100644 --- a/src/ingest_validation_tests/codex_json_validator.py +++ b/src/ingest_validation_tests/codex_json_validator.py @@ -12,19 +12,19 @@ class CodexJsonValidator(Validator): def collect_errors(self, **kwargs) -> List[str]: del kwargs - if 'codex' not in self.assay_type.lower(): + if "codex" not in self.assay_type.lower(): return [] - schema_path = Path(__file__).parent / 'codex_schema.json' + schema_path = Path(__file__).parent / "codex_schema.json" schema = json.loads(schema_path.read_text()) rslt = [] - for glob_expr in ['**/dataset.json']: + for glob_expr in ["**/dataset.json"]: for path in self.paths: for file in path.glob(glob_expr): instance = json.loads(file.read_text()) try: validate(instance=instance, schema=schema) except Exception as e: - rslt.append(f'{file}: {e}') + rslt.append(f"{file}: {e}") return rslt diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 2a98e43..efd43ce 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -85,9 +85,7 @@ def _validate_fastq_line_2(self, line: str) -> List[str]: self._line_2_length = len(line) self._last_line_2_number = self._line_number - invalid_chars = "".join( - c for c 
in line if c not in self._FASTQ_LINE_2_VALID_CHARS - ) + invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS) if invalid_chars: return [f"Line contains invalid character(s): {invalid_chars}"] @@ -103,9 +101,7 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: errors: List[str] = [] invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126) if invalid_chars: - errors.append( - "Line contains invalid quality character(s): " f'"{invalid_chars}"' - ) + errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"') if len(line) != self._line_2_length: errors.append( @@ -129,9 +125,7 @@ def validate_fastq_record(self, line: str, line_number: int) -> List[str]: self._VALIDATE_FASTQ_LINE_METHODS[line_index] ) - assert ( - validator_method - ), f"No validator method defined for record index {line_index}" + assert validator_method, f"No validator method defined for record index {line_index}" return validator_method(self, line) @@ -155,9 +149,7 @@ def validate_fastq_file(self, fastq_file: Path) -> None: if not is_valid_filename(fastq_file.name): # If we don't like the filename, don't bother reading the contents. 
- self.errors.append( - "Filename does not have proper format " "and will not be processed" - ) + self.errors.append("Filename does not have proper format " "and will not be processed") return self._line_number = 0 @@ -170,9 +162,7 @@ def validate_fastq_file(self, fastq_file: Path) -> None: self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}.")) return except IOError: - self.errors.append( - self._format_error(f"Unable to open FASTQ data file {fastq_file}.") - ) + self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}.")) return self._file_record_counts[str(fastq_file)] = records_read @@ -272,9 +262,7 @@ def main(): elif isinstance(args.filepaths, str): filepaths = [Path(args.filepaths)] else: - raise Exception( - f"Validator init received base_paths arg as type {type(args.filepaths)}" - ) + raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}") validator = FASTQValidatorLogic(True) validator.validate_fastq_files_in_path(filepaths, Lock()) diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py index cccc908..b0953eb 100644 --- a/src/ingest_validation_tests/gz_validator.py +++ b/src/ingest_validation_tests/gz_validator.py @@ -1,9 +1,9 @@ +import gzip +import re from multiprocessing import Pool from os import cpu_count -import re from typing import List -import gzip from ingest_validation_tools.plugin_validator import Validator diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py index 4c7ccc1..528a2b5 100644 --- a/src/ingest_validation_tests/publication_validator.py +++ b/src/ingest_validation_tests/publication_validator.py @@ -2,10 +2,11 @@ Test for some common errors in the directory and file structure of publications. 
""" -from typing import List -import re import json +import re from pathlib import Path +from typing import List + import frontmatter from ingest_validation_tools.plugin_validator import Validator @@ -15,50 +16,57 @@ class PublicationValidator(Validator): Test for some common errors in the directory and file structure of publications. """ + description = "Test for common problems found in publications" cost = 1.0 - base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)' - url_re = r'[Uu][Rr][Ll]' + base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)" + url_re = r"[Uu][Rr][Ll]" def collect_errors(self, **kwargs) -> List[str]: """ Return the errors found by this validator """ del kwargs - if self.assay_type != 'Publication': + if self.assay_type != "Publication": return [] # We only test Publication data rslt = [] for path in self.paths: try: - vignette_path = path / 'vignettes' - assert vignette_path.is_dir(), 'vignettes not found or not a directory' - for this_vignette_path in vignette_path.glob('*'): - assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}" - " in vignettes") - this_vignette_all_paths = set(this_vignette_path.glob('*')) + vignette_path = path / "vignettes" + assert vignette_path.is_dir(), "vignettes not found or not a directory" + for this_vignette_path in vignette_path.glob("*"): + assert this_vignette_path.is_dir(), ( + f"Found the non-dir {this_vignette_path}" " in vignettes" + ) + this_vignette_all_paths = set(this_vignette_path.glob("*")) if not all(pth.is_file() for pth in this_vignette_all_paths): - raise AssertionError('Found a subdirectory in a vignette') + raise AssertionError("Found a subdirectory in a vignette") md_found = False vig_figures = [] - for md_path in this_vignette_path.glob('*.md'): + for md_path in this_vignette_path.glob("*.md"): if md_found: - raise AssertionError('A vignette has more than one markdown file') + raise AssertionError("A vignette has more than one markdown file") else: md_found = True vig_fm = 
frontmatter.loads(md_path.read_text()) - for key in ['name', 'figures']: - assert key in vig_fm.metadata, ('vignette markdown is incorrectly' - f' formatted or has no {key}') - for fig_dict in vig_fm.metadata['figures']: - assert 'file' in fig_dict, 'figure dict does not reference a file' - assert 'name' in fig_dict, 'figure dict does not provide a name' - vig_figures.append(fig_dict['file']) + for key in ["name", "figures"]: + assert key in vig_fm.metadata, ( + "vignette markdown is incorrectly" f" formatted or has no {key}" + ) + for fig_dict in vig_fm.metadata["figures"]: + assert "file" in fig_dict, "figure dict does not reference a file" + assert "name" in fig_dict, "figure dict does not provide a name" + vig_figures.append(fig_dict["file"]) this_vignette_all_paths.remove(md_path) for fname in vig_figures: - rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path)) + rslt.extend( + self.validate_vitessce_config(this_vignette_path / fname, path) + ) this_vignette_all_paths.remove(this_vignette_path / fname) - assert not this_vignette_all_paths, ('unexpected files in vignette:' - f' {list(str(elt) for elt in this_vignette_all_paths)}') + assert not this_vignette_all_paths, ( + "unexpected files in vignette:" + f" {list(str(elt) for elt in this_vignette_all_paths)}" + ) except AssertionError as excp: rslt.append(str(excp)) @@ -94,9 +102,10 @@ def validate_vitessce_config(self, json_path, path): match = re.match(self.base_url_re, val) if match: # it starts with {{ base_url }} extra_url = match.group(2) - data_path = path / 'data' / extra_url - assert data_path.exists(), ("expected data file" - f" {Path('data') / extra_url} is absent") + data_path = path / "data" / extra_url + assert data_path.exists(), ( + "expected data file" f" {Path('data') / extra_url} is absent" + ) except AssertionError as excp: rslt.append(str(excp)) diff --git a/src/ingest_validation_tests/tiff_validator.py b/src/ingest_validation_tests/tiff_validator.py index 
d98e3ec..0697b46 100644 --- a/src/ingest_validation_tests/tiff_validator.py +++ b/src/ingest_validation_tests/tiff_validator.py @@ -32,13 +32,15 @@ class TiffValidator(Validator): cost = 1.0 def collect_errors(self, **kwargs) -> List[str]: - threads = kwargs.get('coreuse', None) or cpu_count() // 4 or 1 + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 pool = Pool(threads) filenames_to_test = [] - for glob_expr in ['**/*.tif', '**/*.tiff', '**/*.TIFF', '**/*.TIF']: + for glob_expr in ["**/*.tif", "**/*.tiff", "**/*.TIFF", "**/*.TIF"]: for path in self.paths: for file in path.glob(glob_expr): filenames_to_test.append(file) - return list(rslt for rslt in pool.imap_unordered(_check_tiff_file, - filenames_to_test) - if rslt is not None) + return list( + rslt + for rslt in pool.imap_unordered(_check_tiff_file, filenames_to_test) + if rslt is not None + ) diff --git a/tests/pytest_runner.py b/tests/pytest_runner.py index 0743dad..ac7e5a3 100644 --- a/tests/pytest_runner.py +++ b/tests/pytest_runner.py @@ -1,12 +1,15 @@ import sys from pathlib import Path + import pytest -class add_path(): + +class add_path: """ Add an element to sys.path using a context. 
Thanks to Eugene Yarmash https://stackoverflow.com/a/39855753 """ + def __init__(self, path): self.path = path @@ -22,16 +25,13 @@ def __exit__(self, exc_type, exc_value, traceback): def main(): if len(sys.argv) != 2: - sys.exit(f'usage: {sys.argv[0]} path-to-ingest-validation-tools') - tools_path = Path(sys.argv[1]).resolve() / 'src' - plugins_path = (Path(__file__).resolve().parent.parent - / 'src' - / 'ingest_validation_tests' - ) + sys.exit(f"usage: {sys.argv[0]} path-to-ingest-validation-tools") + tools_path = Path(sys.argv[1]).resolve() / "src" + plugins_path = Path(__file__).resolve().parent.parent / "src" / "ingest_validation_tests" with add_path(str(tools_path)): with add_path(str(plugins_path)): - sys.exit(pytest.main(['-vv'])) + sys.exit(pytest.main(["-vv"])) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/test_codex_common_errors_validator.py b/tests/test_codex_common_errors_validator.py index 23cd5e7..68d1d82 100644 --- a/tests/test_codex_common_errors_validator.py +++ b/tests/test_codex_common_errors_validator.py @@ -1,53 +1,84 @@ -from pathlib import Path import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_starts_list'), ( - ('test_data/fake_codex_tree_0.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_1.zip', ['The segmentation.json file is in', - 'Unexpected error reading']), - ('test_data/fake_codex_tree_2.zip', ['The raw/src_ subdirectory is missing?']), - ('test_data/fake_codex_tree_3.zip', ['channelnames.txt is missing']), - ('test_data/fake_codex_tree_4.zip', ['Unexpected error reading']), - ('test_data/fake_codex_tree_5.zip', ['channelnames.txt does not match channelnames_report.txt' - ' on line 1: HLADR vs HLA-DR', - 'channelnames.txt does not match channelnames_report.txt' - ' on line 6: Empty vs Blank']), - ('test_data/fake_codex_tree_6.zip', ['Could not parse ']), - ('test_data/fake_codex_tree_7.zip', []), - 
('test_data/fake_codex_tree_8.zip', ['Region numbers are not contiguous']), - ('test_data/fake_codex_tree_9.zip', ['Cycle numbers are not contiguous', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_10.zip', ['Directory string "cyc0a3_reg001_211119_040351"' - ' cycle number is not an integer']), - ('test_data/fake_codex_tree_11.zip', ['Directory string "cyc003_reg0a1_211119_040351"' - ' region number is not an integer']), - ('test_data/fake_codex_tree_12.zip', ['Directory string "cyc002_rig001_211119_040351"' - ' does not include "_reg"']), - ('test_data/fake_codex_tree_13.zip', ['Cycle numbering does not start at 1']), - ('test_data/fake_codex_tree_14.zip', ['Region numbering does not start at 1']), - ('test_data/fake_codex_tree_15.zip', ['Not all cycle/region pairs are present', - 'The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_16.zip', []), - ('test_data/fake_codex_tree_17.zip', ['A dataset.json file exists but is in the wrong place', - 'Region numbering does not start at 1']), - ('test_data/fake_codex_tree_18.zip', ['The number of channels per cycle is not constant']), - ('test_data/fake_codex_tree_19.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_starts_list"), + ( + ("test_data/fake_codex_tree_0.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_1.zip", + ["The segmentation.json file is in", "Unexpected error reading"], + ), + ("test_data/fake_codex_tree_2.zip", ["The raw/src_ subdirectory is missing?"]), + ("test_data/fake_codex_tree_3.zip", ["channelnames.txt is missing"]), + ("test_data/fake_codex_tree_4.zip", ["Unexpected error reading"]), + ( + "test_data/fake_codex_tree_5.zip", + [ + "channelnames.txt does not match channelnames_report.txt" + " on line 1: HLADR vs HLA-DR", + "channelnames.txt does not match channelnames_report.txt" + " on line 6: Empty vs Blank", + ], + ), + ("test_data/fake_codex_tree_6.zip", ["Could not parse "]), + 
("test_data/fake_codex_tree_7.zip", []), + ("test_data/fake_codex_tree_8.zip", ["Region numbers are not contiguous"]), + ( + "test_data/fake_codex_tree_9.zip", + [ + "Cycle numbers are not contiguous", + "The number of channels per cycle is not constant", + ], + ), + ( + "test_data/fake_codex_tree_10.zip", + ['Directory string "cyc0a3_reg001_211119_040351"' " cycle number is not an integer"], + ), + ( + "test_data/fake_codex_tree_11.zip", + ['Directory string "cyc003_reg0a1_211119_040351"' " region number is not an integer"], + ), + ( + "test_data/fake_codex_tree_12.zip", + ['Directory string "cyc002_rig001_211119_040351"' ' does not include "_reg"'], + ), + ("test_data/fake_codex_tree_13.zip", ["Cycle numbering does not start at 1"]), + ("test_data/fake_codex_tree_14.zip", ["Region numbering does not start at 1"]), + ( + "test_data/fake_codex_tree_15.zip", + [ + "Not all cycle/region pairs are present", + "The number of channels per cycle is not constant", + ], + ), + ("test_data/fake_codex_tree_16.zip", []), + ( + "test_data/fake_codex_tree_17.zip", + [ + "A dataset.json file exists but is in the wrong place", + "Region numbering does not start at 1", + ], + ), + ("test_data/fake_codex_tree_18.zip", ["The number of channels per cycle is not constant"]), + ("test_data/fake_codex_tree_19.zip", []), + ), +) def test_codex_common_errors_validator(test_data_fname, msg_starts_list, tmp_path): from codex_common_errors_validator import CodexCommonErrorsValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], - 'CODEX' - ) + validator = CodexCommonErrorsValidator([Path(tmp_path / test_data_path.stem)], "CODEX") errors = validator.collect_errors()[:] - print(f'ERRORS FOLLOW FOR {test_data_fname}') + print(f"ERRORS FOLLOW FOR {test_data_fname}") for err in errors: print(err) - print('ERRORS ABOVE') + print("ERRORS ABOVE") assert 
len(msg_starts_list) == len(errors) for err_str, expected_str in zip(errors, msg_starts_list): assert err_str.startswith(expected_str) diff --git a/tests/test_codex_json_validator.py b/tests/test_codex_json_validator.py index a2998ff..4539270 100644 --- a/tests/test_codex_json_validator.py +++ b/tests/test_codex_json_validator.py @@ -1,22 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip', - [".*is not of type 'object'.*"]), - ('test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip', []), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/good_codex_akoya_directory_v1_with_dataset_json_fails.zip", + [".*is not of type 'object'.*"], + ), + ("test_data/good_codex_akoya_directory_v1_with_dataset_json_passes.zip", []), + ), +) def test_codex_json_validator(test_data_fname, msg_re_list, tmp_path): from codex_json_validator import CodexJsonValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = CodexJsonValidator(tmp_path / test_data_path.stem, - 'CODEX' - ) + validator = CodexJsonValidator(tmp_path / test_data_path.stem, "CODEX") errors = validator.collect_errors()[:] assert len(msg_re_list) == len(errors) for err_str, expected_re in zip(errors, msg_re_list): diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index a244cd8..31bd68a 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -1,36 +1,31 @@ -from multiprocessing import Lock +import gzip from pathlib import Path from typing import TextIO -import gzip import pytest -from src.ingest_validation_tests.fastq_validator_logic import \ - FASTQValidatorLogic +from src.ingest_validation_tests.fastq_validator_logic 
import FASTQValidatorLogic -_GOOD_RECORDS = '''\ +_GOOD_RECORDS = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFFF -''' +""" _GOOD_QUALITY_RECORD = ( - '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ' - r'[\]^_`abcdefghijklmnopqrstuvwxyz{|}~' + "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + r"[\]^_`abcdefghijklmnopqrstuvwxyz{|}~" ) _GOOD_SEQUENCE_FOR_QUALITY = ( - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' - 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" ) def _open_output_file(filename: Path, use_gzip: bool) -> TextIO: - return ( - gzip.open(filename, 'wt') if use_gzip - else open(filename, 'wt') - ) + return gzip.open(filename, "wt") if use_gzip else open(filename, "wt") class TestFASTQValidatorLogic: @@ -46,33 +41,29 @@ def test_fastq_validator_no_files(self, fastq_validator, tmp_path): def test_fastq_validator_bad_gzip_data(self, fastq_validator, tmp_path): # Note that the filename ends in .gz, although it will not contain # compressed data. 
- test_file = tmp_path.joinpath('test.fastq.gz') + test_file = tmp_path.joinpath("test.fastq.gz") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) assert "Bad gzip file" in fastq_validator.errors[0] - def test_fastq_validator_unrecognized_file(self, fastq_validator, - tmp_path): - test_file = tmp_path.joinpath('test.txt') + def test_fastq_validator_unrecognized_file(self, fastq_validator, tmp_path): + test_file = tmp_path.joinpath("test.txt") with _open_output_file(test_file, False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_file(test_file) - assert "Filename does not have proper format" in \ - fastq_validator.errors[0] + assert "Filename does not have proper format" in fastq_validator.errors[0] - def test_fastq_validator_empty_directory(self, fastq_validator, - tmp_path): + def test_fastq_validator_empty_directory(self, fastq_validator, tmp_path): fastq_validator.validate_fastq_files_in_path([tmp_path], 2) # No files in path means no errors assert fastq_validator.errors == [] @pytest.mark.parametrize("use_gzip", [False, True]) def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): - test_file = tmp_path.joinpath('test.fastq.gz' if use_gzip - else 'test.fastq') + test_file = tmp_path.joinpath("test.fastq.gz" if use_gzip else "test.fastq") with _open_output_file(test_file, use_gzip) as output: output.write(_GOOD_RECORDS) @@ -80,9 +71,9 @@ def test_fastq_validator_basic(self, fastq_validator, tmp_path, use_gzip): assert not fastq_validator.errors def test_fastq_validator_bad_file(self, fastq_validator, tmp_path): - test_file = tmp_path.joinpath('test.fastq') + test_file = tmp_path.joinpath("test.fastq") with _open_output_file(test_file, False) as output: - output.write('ABCDEF') + output.write("ABCDEF") fastq_validator.validate_fastq_files_in_path([tmp_path], 2) @@ -91,56 +82,54 @@ def test_fastq_validator_bad_file(self, fastq_validator, 
tmp_path): assert fastq_validator.errors def test_fastq_validator_duplicate_file(self, fastq_validator, tmp_path): - for subdirectory in ['a', 'b']: + for subdirectory in ["a", "b"]: subdirectory_path = tmp_path.joinpath(subdirectory) subdirectory_path.mkdir() - with _open_output_file(subdirectory_path.joinpath('test.fastq'), - False) as output: + with _open_output_file(subdirectory_path.joinpath("test.fastq"), False) as output: output.write(_GOOD_RECORDS) fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - assert "test.fastq has been found multiple times" in \ - fastq_validator.errors[0] + assert "test.fastq has been found multiple times" in fastq_validator.errors[0] def test_fastq_validator_io_error(self, fastq_validator, tmp_path): - fake_path = tmp_path.joinpath('does-not-exist.fastq') + fake_path = tmp_path.joinpath("does-not-exist.fastq") fastq_validator.validate_fastq_file(fake_path) assert "Unable to open" in fastq_validator.errors[0] def test_fastq_validator_line_1_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('@SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("@SEQ_ID", 0) assert not result def test_fastq_validator_line_1_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('*SEQ_ID', 0) + result = fastq_validator.validate_fastq_record("*SEQ_ID", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_1_empty(self, fastq_validator): - result = fastq_validator.validate_fastq_record('', 0) + result = fastq_validator.validate_fastq_record("", 0) assert "does not begin with '@'" in result[0] def test_fastq_validator_line_2_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACTGACTGNNNN', 1) + result = fastq_validator.validate_fastq_record("ACTGACTGACTGNNNN", 1) assert not result def test_fastq_validator_line_2_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('ACTGACT$ACTGNNNN', 1) + result = 
fastq_validator.validate_fastq_record("ACTGACT$ACTGNNNN", 1) assert "contains invalid character(s): $" in result[0] def test_fastq_validator_line_3_good(self, fastq_validator): - result = fastq_validator.validate_fastq_record('+SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("+SEQ_ID", 2) assert not result def test_fastq_validator_line_3_bad(self, fastq_validator): - result = fastq_validator.validate_fastq_record('!SEQ_ID', 2) + result = fastq_validator.validate_fastq_record("!SEQ_ID", 2) assert "does not begin with '+'" in result[0] @@ -151,42 +140,41 @@ def test_fastq_validator_line_4_good(self, fastq_validator): assert not result def test_fastq_validator_line_4_bad(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABC !@#', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABC !@#", 3) assert 'contains invalid quality character(s): " "' in result[0] def test_fastq_validator_line_4_matching_length(self, fastq_validator): - fastq_validator.validate_fastq_record('1234567', 1) - result = fastq_validator.validate_fastq_record('ABCDEFG', 3) + fastq_validator.validate_fastq_record("1234567", 1) + result = fastq_validator.validate_fastq_record("ABCDEFG", 3) assert not result - def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, - tmp_path): - fastq_validator.validate_fastq_record('123456789ABCDEF', 1) - fastq_validator.validate_fastq_record('ABC', 3) + def test_fastq_validator_line_4_mismatched_length(self, fastq_validator, tmp_path): + fastq_validator.validate_fastq_record("123456789ABCDEF", 1) + fastq_validator.validate_fastq_record("ABC", 3) - test_data = '''\ + test_data = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT NACTGACTGA + #FFFFFFFF -''' +""" - new_file = tmp_path.joinpath('test.fastq') + new_file = tmp_path.joinpath("test.fastq") with _open_output_file(new_file, 
False) as output: output.write(test_data) fastq_validator.validate_fastq_file(new_file) - assert "contains 9 characters which does not match line 2's 10" in \ - fastq_validator.errors[0] + assert ( + "contains 9 characters which does not match line 2's 10" in fastq_validator.errors[0] + ) - def test_fastq_validator_record_counts_good(self, fastq_validator, - tmp_path): + def test_fastq_validator_record_counts_good(self, fastq_validator, tmp_path): for filename in [ - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq', - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq' + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq", + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq", ]: new_file = tmp_path.joinpath(filename) with _open_output_file(new_file, False) as output: @@ -196,15 +184,14 @@ def test_fastq_validator_record_counts_good(self, fastq_validator, assert not fastq_validator.errors - def test_fastq_validator_record_counts_bad(self, fastq_validator, - tmp_path): - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq'), - False) as output: + def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path): + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) - with _open_output_file(tmp_path.joinpath( - 'SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq'), - False) as output: + with _open_output_file( + tmp_path.joinpath("SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq"), False + ) as output: output.write(_GOOD_RECORDS) output.write(_GOOD_RECORDS) diff --git a/tests/test_gz_validator.py b/tests/test_gz_validator.py index 0c6c8b2..ad748c4 100644 --- a/tests/test_gz_validator.py +++ b/tests/test_gz_validator.py @@ -1,19 +1,24 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), 
( - ('test_data/fake_snrnaseq_tree_good.zip', []), - ('test_data/fake_snrnaseq_tree_bad.zip', ['.*text2.txt.gz is not a valid gzipped file']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/fake_snrnaseq_tree_good.zip", []), + ("test_data/fake_snrnaseq_tree_bad.zip", [".*text2.txt.gz is not a valid gzipped file"]), + ), +) def test_gz_validator(test_data_fname, msg_re_list, tmp_path): from gz_validator import GZValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = GZValidator(tmp_path / test_data_path.stem, 'snRNAseq') + validator = GZValidator(tmp_path / test_data_path.stem, "snRNAseq") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_ome_tiff_validator.py b/tests/test_ome_tiff_validator.py index 09e2163..89ad198 100644 --- a/tests/test_ome_tiff_validator.py +++ b/tests/test_ome_tiff_validator.py @@ -1,20 +1,27 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/codex_tree_ometiff_bad.zip', - ['.*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*']), - ('test_data/codex_tree_ometiff_good.zip',[]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ( + "test_data/codex_tree_ometiff_bad.zip", + [".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*"], + ), + ("test_data/codex_tree_ometiff_good.zip", []), + ), +) def test_ome_tiff_validator(test_data_fname, msg_re_list, tmp_path): from ome_tiff_validator import OmeTiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = OmeTiffValidator(tmp_path / test_data_path.stem, 'CODEX') + validator = OmeTiffValidator(tmp_path / test_data_path.stem, 
"CODEX") errors = validator.collect_errors(coreuse=4)[:] assert len(msg_re_list) == len(errors) for err_str, re_str in zip(errors, msg_re_list): diff --git a/tests/test_publication_validator.py b/tests/test_publication_validator.py index ebb768c..50c3a0e 100644 --- a/tests/test_publication_validator.py +++ b/tests/test_publication_validator.py @@ -1,34 +1,46 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/publication_tree_good.zip', []), - ('test_data/publication_tree_good_complex.zip', []), - ('test_data/publication_tree_bad_complex.zip', - [ - 'expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent', - 'expected data file data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent', - 'expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent', - ]), - ('test_data/publication_tree_bad_1.zip', ['vignettes not found or not a directory']), - ('test_data/publication_tree_bad_2.zip', ['Found a subdirectory in a vignette']), - ('test_data/publication_tree_bad_3.zip', ['A vignette has more than one markdown file']), - ('test_data/publication_tree_bad_4.zip', ['figure dict does not provide a name']), - ('test_data/publication_tree_bad_5.zip', ['figure dict does not reference a file']), - ('test_data/publication_tree_bad_6.zip', ['unexpected files in vignette.*']), - ('test_data/publication_tree_bad_7.zip', ['expected data file' - ' data/codeluppi_2018_nature_methods.molecules.h5ad.zarr' - ' is absent']), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/publication_tree_good.zip", []), + ("test_data/publication_tree_good_complex.zip", []), + ( + "test_data/publication_tree_bad_complex.zip", + [ + "expected data file data/vignette_12/A/0/325b936e-4132-45fe-8674-9abbde568be8 is absent", # noqa: E501 + "expected data file 
data/vignette_12/A/0/9db02302-07d9-4c54-ad45-4578c4822cce is absent", # noqa: E501 + "expected data file data/vignette_12/A/1/90b3667d-3ccc-4241-9227-fee578d41bac is absent", # noqa: E501 + ], + ), + ("test_data/publication_tree_bad_1.zip", ["vignettes not found or not a directory"]), + ("test_data/publication_tree_bad_2.zip", ["Found a subdirectory in a vignette"]), + ("test_data/publication_tree_bad_3.zip", ["A vignette has more than one markdown file"]), + ("test_data/publication_tree_bad_4.zip", ["figure dict does not provide a name"]), + ("test_data/publication_tree_bad_5.zip", ["figure dict does not reference a file"]), + ("test_data/publication_tree_bad_6.zip", ["unexpected files in vignette.*"]), + ( + "test_data/publication_tree_bad_7.zip", + [ + "expected data file" + " data/codeluppi_2018_nature_methods.molecules.h5ad.zarr" + " is absent" + ], + ), + ), +) def test_publication_validator(test_data_fname, msg_re_list, tmp_path): from publication_validator import PublicationValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = PublicationValidator(tmp_path / test_data_path.stem, 'Publication') + validator = PublicationValidator(tmp_path / test_data_path.stem, "Publication") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = [] diff --git a/tests/test_tiff_validator.py b/tests/test_tiff_validator.py index c2d271b..4ebe66c 100644 --- a/tests/test_tiff_validator.py +++ b/tests/test_tiff_validator.py @@ -1,24 +1,32 @@ -from pathlib import Path -import zipfile import re +import zipfile +from pathlib import Path import pytest -@pytest.mark.parametrize(('test_data_fname', 'msg_re_list'), ( - ('test_data/tiff_tree_good.zip', []), - ('test_data/tiff_tree_bad.zip', [ - '.*notatiff.tif is not a valid TIFF file', - '.*notatiff.tiff is not a valid TIFF file', - '.*notatiff.TIFF is not a valid TIFF file', - '.*notatiff.TIF is not a valid TIFF 
file', - ]), - )) + +@pytest.mark.parametrize( + ("test_data_fname", "msg_re_list"), + ( + ("test_data/tiff_tree_good.zip", []), + ( + "test_data/tiff_tree_bad.zip", + [ + ".*notatiff.tif is not a valid TIFF file", + ".*notatiff.tiff is not a valid TIFF file", + ".*notatiff.TIFF is not a valid TIFF file", + ".*notatiff.TIF is not a valid TIFF file", + ], + ), + ), +) def test_tiff_validator(test_data_fname, msg_re_list, tmp_path): from tiff_validator import TiffValidator + test_data_path = Path(test_data_fname) zfile = zipfile.ZipFile(test_data_path) zfile.extractall(tmp_path) - validator = TiffValidator(tmp_path / test_data_path.stem, 'codex') + validator = TiffValidator(tmp_path / test_data_path.stem, "codex") errors = validator.collect_errors(coreuse=4)[:] print(f"errors: {errors}") matched_err_str_list = []