merge

hubmapconsortium · Feb 26, 2024 · 62f4355 · 62f4355
2 parents f81cb3c + 39be208
commit 62f4355
Show file tree

Hide file tree

Showing 20 changed files with 391 additions and 248 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+extend-ignore = E203,E501,W503
+max-line-length = 99
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -0,0 +1,32 @@
+name: Linters
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 black isort
+      - name: Flake8 Lint
+        run: |
+          flake8 --ignore=E501,W503,E203 .
+      - name: Black Lint
+        run: |
+          black --line-length 99 --check --verbose .
+      - name: isort Lint
+        run: |
+          isort --profile black --check-only --diff .
diff --git a/.gitignore b/.gitignore
@@ -105,6 +105,7 @@ celerybeat.pid
 
 # Environments
 .env
+.envrc
 .venv
 env/
 venv/

diff --git a/README.md b/README.md
@@ -1,9 +1,58 @@
 # ingest-validation-tests
 
-This repository contains plug-in tests for use during validation of submissions.  It is referenced by ingest-validation-tools.
+This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
 
 ## Development process
 
+### Branches
+
 - Make new feature branches from `devel`.
+- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:
+
+  ```
+  black --line-length 99 .
+  isort --profile black --multi-line 3 .
+  flake8
+  ```
+
+  (Integrating black and potentially isort/flake8 with your editor may allow you to skip this step, see Setup section below.)
+
 - Make PRs to `devel`. (This is the default branch.)
-- The last reviewer to approve a PR should merge it.  At the moment that is likely to be @jswelling .
+- The last reviewer to approve a PR should merge it.
+
+### Setup
+
+- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:
+
+  ```
+  python3.9 -m venv hm-ingest-validation-tests
+  source hm-ingest-validation-tests/bin/activate
+  ```
+
+- Run `pip install -r requirements-dev.txt`
+- (optional) Integrate black with your editor.
+  - [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
+- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
+- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.
+
+### Testing
+
+- If ingest-validation-tools is not already set up:
+
+  ```
+  # Starting from ingest-validation-tests...
+  cd ..
+  git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
+  cd ingest-validation-tests
+  pip install -r ../ingest-validation-tools/requirements.txt
+  pip install -r ../ingest-validation-tools/requirements-dev.txt
+  ```
+
+- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run:
+
+  ```
+  pip install -r <path-to-ingest-validation-tools>/requirements.txt
+  pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
+  ```
+
+- Run `test.sh`
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,6 @@
+[tool.black]
+line-length = 99
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,11 @@
+black==23.12.1
+flake8==7.0.0
+git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils
+imagecodecs>=2023.3.16
+isort==5.13.2
+jsonschema==4.4.0
+pandas>=1.2.0
+pytest==8.0.0
+python-frontmatter>=1.0.0
+tifffile==2020.10.1
+xmlschema>=1.6
diff --git a/src/ingest_validation_tests/codex_common_errors_validator.py b/src/ingest_validation_tests/codex_common_errors_validator.py
@@ -28,18 +28,12 @@ def _split_cycle_dir_string(cycle_str):
     try:
         cyc_id = int(words[0][len("cyc") :])
     except ValueError:
-        raise AssertionError(
-            f'Directory string "{cycle_str}" cycle number is' " not an integer"
-        )
-    assert words[1].startswith("reg"), (
-        f'Directory string "{cycle_str}" does' ' not include "_reg"'
-    )
+        raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer")
+    assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"'
     try:
         reg_id = int(words[1][len("reg") :])
     except ValueError:
-        raise AssertionError(
-            f'Directory string "{cycle_str}" region number is' " not an integer"
-        )
+        raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer")
     return cyc_id, reg_id
 
 
@@ -86,9 +80,7 @@ def collect_errors(self, **kwargs) -> List[str]:
                     self._log("FOUND dataset.json; skipping further analysis")
                     raise QuitNowException()
                 elif any_dataset_json_exists:
-                    rslt.append(
-                        "A dataset.json file exists but" " is in the wrong place"
-                    )
+                    rslt.append("A dataset.json file exists but" " is in the wrong place")
 
                 # is the segmentation.json file on the right side?
                 found = False
@@ -102,9 +94,7 @@ def collect_errors(self, **kwargs) -> List[str]:
                     if right_place:
                         pass
                     else:
-                        rslt.append(
-                            "The segmentation.json file is in the wrong subdirectory"
-                        )
+                        rslt.append("The segmentation.json file is in the wrong subdirectory")
                 else:
                     rslt.append("The segmentation.json file is missing or misplaced")
 
@@ -145,8 +135,7 @@ def collect_errors(self, **kwargs) -> List[str]:
                             raise QuitNowException()
                     if len(rpt_df.columns) != 2:
                         rslt.append(
-                            f"Could not parse {report_csv_path}."
-                            " Is it a comma-separated table?"
+                            f"Could not parse {report_csv_path}." " Is it a comma-separated table?"
                         )
                         raise QuitNowException()
                     col_0, col_1 = rpt_df.columns
@@ -209,9 +198,7 @@ def collect_errors(self, **kwargs) -> List[str]:
                 # excluding any HandE channels
                 total_channel_count = len(cn_df)
                 h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")])
-                channels_per_cycle = (
-                    total_channel_count - h_and_e_channel_count
-                ) / len(cycles)
+                channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles)
                 if channels_per_cycle != int(channels_per_cycle):
                     failures.append("The number of channels per cycle is not constant")
                 if failures:

diff --git a/src/ingest_validation_tests/codex_json_validator.py b/src/ingest_validation_tests/codex_json_validator.py
@@ -12,19 +12,19 @@ class CodexJsonValidator(Validator):
 
     def collect_errors(self, **kwargs) -> List[str]:
         del kwargs
-        if 'codex' not in self.assay_type.lower():
+        if "codex" not in self.assay_type.lower():
             return []
 
-        schema_path = Path(__file__).parent / 'codex_schema.json'
+        schema_path = Path(__file__).parent / "codex_schema.json"
         schema = json.loads(schema_path.read_text())
 
         rslt = []
-        for glob_expr in ['**/dataset.json']:
+        for glob_expr in ["**/dataset.json"]:
             for path in self.paths:
                 for file in path.glob(glob_expr):
                     instance = json.loads(file.read_text())
                     try:
                         validate(instance=instance, schema=schema)
                     except Exception as e:
-                        rslt.append(f'{file}: {e}')
+                        rslt.append(f"{file}: {e}")
         return rslt
diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py
@@ -85,9 +85,7 @@ def _validate_fastq_line_2(self, line: str) -> List[str]:
         self._line_2_length = len(line)
         self._last_line_2_number = self._line_number
 
-        invalid_chars = "".join(
-            c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS
-        )
+        invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS)
         if invalid_chars:
             return [f"Line contains invalid character(s): {invalid_chars}"]
 
@@ -103,9 +101,7 @@ def _validate_fastq_line_4(self, line: str) -> List[str]:
         errors: List[str] = []
         invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126)
         if invalid_chars:
-            errors.append(
-                "Line contains invalid quality character(s): " f'"{invalid_chars}"'
-            )
+            errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"')
 
         if len(line) != self._line_2_length:
             errors.append(
@@ -129,9 +125,7 @@ def validate_fastq_record(self, line: str, line_number: int) -> List[str]:
             self._VALIDATE_FASTQ_LINE_METHODS[line_index]
         )
 
-        assert (
-            validator_method
-        ), f"No validator method defined for record index {line_index}"
+        assert validator_method, f"No validator method defined for record index {line_index}"
 
         return validator_method(self, line)
 
@@ -155,9 +149,7 @@ def validate_fastq_file(self, fastq_file: Path) -> None:
 
         if not is_valid_filename(fastq_file.name):
             # If we don't like the filename, don't bother reading the contents.
-            self.errors.append(
-                "Filename does not have proper format " "and will not be processed"
-            )
+            self.errors.append("Filename does not have proper format " "and will not be processed")
             return
 
         self._line_number = 0
@@ -170,9 +162,7 @@ def validate_fastq_file(self, fastq_file: Path) -> None:
             self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}."))
             return
         except IOError:
-            self.errors.append(
-                self._format_error(f"Unable to open FASTQ data file {fastq_file}.")
-            )
+            self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}."))
             return
         self._file_record_counts[str(fastq_file)] = records_read
 
@@ -272,9 +262,7 @@ def main():
     elif isinstance(args.filepaths, str):
         filepaths = [Path(args.filepaths)]
     else:
-        raise Exception(
-            f"Validator init received base_paths arg as type {type(args.filepaths)}"
-        )
+        raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}")
 
     validator = FASTQValidatorLogic(True)
     validator.validate_fastq_files_in_path(filepaths, Lock())

diff --git a/src/ingest_validation_tests/gz_validator.py b/src/ingest_validation_tests/gz_validator.py
@@ -1,9 +1,9 @@
+import gzip
+import re
 from multiprocessing import Pool
 from os import cpu_count
-import re
 from typing import List
 
-import gzip
 from ingest_validation_tools.plugin_validator import Validator
 
 

diff --git a/src/ingest_validation_tests/publication_validator.py b/src/ingest_validation_tests/publication_validator.py
@@ -2,10 +2,11 @@
 Test for some common errors in the directory and file structure of publications.
 """
 
-from typing import List
-import re
 import json
+import re
 from pathlib import Path
+from typing import List
+
 import frontmatter
 from ingest_validation_tools.plugin_validator import Validator
 
@@ -15,50 +16,57 @@ class PublicationValidator(Validator):
     Test for some common errors in the directory and file structure of
     publications.
     """
+
     description = "Test for common problems found in publications"
     cost = 1.0
-    base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)'
-    url_re = r'[Uu][Rr][Ll]'
+    base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)"
+    url_re = r"[Uu][Rr][Ll]"
 
     def collect_errors(self, **kwargs) -> List[str]:
         """
         Return the errors found by this validator
         """
         del kwargs
-        if self.assay_type != 'Publication':
+        if self.assay_type != "Publication":
             return []  # We only test Publication data
         rslt = []
         for path in self.paths:
             try:
-                vignette_path = path / 'vignettes'
-                assert vignette_path.is_dir(), 'vignettes not found or not a directory'
-                for this_vignette_path in vignette_path.glob('*'):
-                    assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}"
-                                                        " in vignettes")
-                    this_vignette_all_paths = set(this_vignette_path.glob('*'))
+                vignette_path = path / "vignettes"
+                assert vignette_path.is_dir(), "vignettes not found or not a directory"
+                for this_vignette_path in vignette_path.glob("*"):
+                    assert this_vignette_path.is_dir(), (
+                        f"Found the non-dir {this_vignette_path}" " in vignettes"
+                    )
+                    this_vignette_all_paths = set(this_vignette_path.glob("*"))
                     if not all(pth.is_file() for pth in this_vignette_all_paths):
-                        raise AssertionError('Found a subdirectory in a vignette')
+                        raise AssertionError("Found a subdirectory in a vignette")
                     md_found = False
                     vig_figures = []
-                    for md_path in this_vignette_path.glob('*.md'):
+                    for md_path in this_vignette_path.glob("*.md"):
                         if md_found:
-                            raise AssertionError('A vignette has more than one markdown file')
+                            raise AssertionError("A vignette has more than one markdown file")
                         else:
                             md_found = True
                         vig_fm = frontmatter.loads(md_path.read_text())
-                        for key in ['name', 'figures']:
-                            assert key in vig_fm.metadata, ('vignette markdown is incorrectly'
-                                                            f' formatted or has no {key}')
-                        for fig_dict in vig_fm.metadata['figures']:
-                            assert 'file' in fig_dict, 'figure dict does not reference a file'
-                            assert 'name' in fig_dict, 'figure dict does not provide a name'
-                            vig_figures.append(fig_dict['file'])
+                        for key in ["name", "figures"]:
+                            assert key in vig_fm.metadata, (
+                                "vignette markdown is incorrectly" f" formatted or has no {key}"
+                            )
+                        for fig_dict in vig_fm.metadata["figures"]:
+                            assert "file" in fig_dict, "figure dict does not reference a file"
+                            assert "name" in fig_dict, "figure dict does not provide a name"
+                            vig_figures.append(fig_dict["file"])
                         this_vignette_all_paths.remove(md_path)
                         for fname in vig_figures:
-                            rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path))
+                            rslt.extend(
+                                self.validate_vitessce_config(this_vignette_path / fname, path)
+                            )
                             this_vignette_all_paths.remove(this_vignette_path / fname)
-                    assert not this_vignette_all_paths, ('unexpected files in vignette:'
-                                                        f' {list(str(elt) for elt in this_vignette_all_paths)}')
+                    assert not this_vignette_all_paths, (
+                        "unexpected files in vignette:"
+                        f" {list(str(elt) for elt in this_vignette_all_paths)}"
+                    )
 
             except AssertionError as excp:
                 rslt.append(str(excp))
@@ -94,9 +102,10 @@ def validate_vitessce_config(self, json_path, path):
                     match = re.match(self.base_url_re, val)
                     if match:  # it starts with {{ base_url }}
                         extra_url = match.group(2)
-                        data_path = path / 'data' / extra_url
-                        assert data_path.exists(), ("expected data file"
-                                                    f" {Path('data') / extra_url} is absent")
+                        data_path = path / "data" / extra_url
+                        assert data_path.exists(), (
+                            "expected data file" f" {Path('data') / extra_url} is absent"
+                        )
 
                 except AssertionError as excp:
                     rslt.append(str(excp))