Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
gesinaphillips committed Feb 26, 2024
2 parents f81cb3c + 39be208 commit 62f4355
Show file tree
Hide file tree
Showing 20 changed files with 391 additions and 248 deletions.
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[flake8]
extend-ignore = E203,E501,W503
max-line-length = 99
32 changes: 32 additions & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: Linters

on:
push:
pull_request:
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 black isort
- name: Flake8 Lint
run: |
flake8 --ignore=E501,W503,E203 .
- name: Black Lint
run: |
black --line-length 99 --check --verbose .
- name: isort Lint
run: |
isort --profile black --check-only --diff .
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ celerybeat.pid

# Environments
.env
.envrc
.venv
env/
venv/
Expand Down
53 changes: 51 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,58 @@
# ingest-validation-tests

This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.
This repository contains plug-in tests for use during validation of submissions. It is referenced by ingest-validation-tools.

## Development process

### Branches

- Make new feature branches from `devel`.
- Before submitting a PR, make sure your code is black, isort, and flake8 compliant. Run the following from the base `ingest-validation-tests` directory:

```
black --line-length 99 .
isort --profile black --multi-line 3 .
flake8
```

(Integrating black and potentially isort/flake8 with your editor may allow you to skip this step, see Setup section below.)

- Make PRs to `devel`. (This is the default branch.)
- The last reviewer to approve a PR should merge it. At the moment that is likely to be @jswelling .
- The last reviewer to approve a PR should merge it.

### Setup

- Creating and activating a virtual environment is recommended. These instructions assume you are using a virtual environment. Example using venv:

```
python3.9 -m venv hm-ingest-validation-tests
source hm-ingest-validation-tests/bin/activate
```

- Run `pip install -r requirements-dev.txt`
- (optional) Integrate black with your editor.
- [Instructions for black.](https://black.readthedocs.io/en/stable/integrations/editors.html)
- (optional) Integrate [isort](https://pycqa.github.io/isort/) with your editor.
- (optional) Integrate [flake8](https://flake8.pycqa.org/en/latest/index.html) with your editor.

### Testing

- If ingest-validation-tools is not already set up:

```
# Starting from ingest-validation-tests...
cd ..
git clone https://github.com/hubmapconsortium/ingest-validation-tools.git
cd ingest-validation-tests
pip install -r ../ingest-validation-tools/requirements.txt
pip install -r ../ingest-validation-tools/requirements-dev.txt
```

- If ingest-validation-tools is already set up, add the appropriate ingest-validation-tools path and run:

```
pip install -r <path-to-ingest-validation-tools>/requirements.txt
pip install -r <path-to-ingest-validation-tools>/requirements-dev.txt
```

- Run `test.sh`
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[tool.black]
line-length = 99

[tool.isort]
profile = "black"
multi_line_output = 3
11 changes: 11 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
black==23.12.1
flake8==7.0.0
git+https://github.com/hubmapconsortium/fastq-utils.git@v0.2.5#egg=hubmap-fastq-utils
imagecodecs>=2023.3.16
isort==5.13.2
jsonschema==4.4.0
pandas>=1.2.0
pytest==8.0.0
python-frontmatter>=1.0.0
tifffile==2020.10.1
xmlschema>=1.6
27 changes: 7 additions & 20 deletions src/ingest_validation_tests/codex_common_errors_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,12 @@ def _split_cycle_dir_string(cycle_str):
try:
cyc_id = int(words[0][len("cyc") :])
except ValueError:
raise AssertionError(
f'Directory string "{cycle_str}" cycle number is' " not an integer"
)
assert words[1].startswith("reg"), (
f'Directory string "{cycle_str}" does' ' not include "_reg"'
)
raise AssertionError(f'Directory string "{cycle_str}" cycle number is' " not an integer")
assert words[1].startswith("reg"), f'Directory string "{cycle_str}" does' ' not include "_reg"'
try:
reg_id = int(words[1][len("reg") :])
except ValueError:
raise AssertionError(
f'Directory string "{cycle_str}" region number is' " not an integer"
)
raise AssertionError(f'Directory string "{cycle_str}" region number is' " not an integer")
return cyc_id, reg_id


Expand Down Expand Up @@ -86,9 +80,7 @@ def collect_errors(self, **kwargs) -> List[str]:
self._log("FOUND dataset.json; skipping further analysis")
raise QuitNowException()
elif any_dataset_json_exists:
rslt.append(
"A dataset.json file exists but" " is in the wrong place"
)
rslt.append("A dataset.json file exists but" " is in the wrong place")

# is the segmentation.json file on the right side?
found = False
Expand All @@ -102,9 +94,7 @@ def collect_errors(self, **kwargs) -> List[str]:
if right_place:
pass
else:
rslt.append(
"The segmentation.json file is in the wrong subdirectory"
)
rslt.append("The segmentation.json file is in the wrong subdirectory")
else:
rslt.append("The segmentation.json file is missing or misplaced")

Expand Down Expand Up @@ -145,8 +135,7 @@ def collect_errors(self, **kwargs) -> List[str]:
raise QuitNowException()
if len(rpt_df.columns) != 2:
rslt.append(
f"Could not parse {report_csv_path}."
" Is it a comma-separated table?"
f"Could not parse {report_csv_path}." " Is it a comma-separated table?"
)
raise QuitNowException()
col_0, col_1 = rpt_df.columns
Expand Down Expand Up @@ -209,9 +198,7 @@ def collect_errors(self, **kwargs) -> List[str]:
# excluding any HandE channels
total_channel_count = len(cn_df)
h_and_e_channel_count = len(cn_df[cn_df[0].str.startswith("HandE")])
channels_per_cycle = (
total_channel_count - h_and_e_channel_count
) / len(cycles)
channels_per_cycle = (total_channel_count - h_and_e_channel_count) / len(cycles)
if channels_per_cycle != int(channels_per_cycle):
failures.append("The number of channels per cycle is not constant")
if failures:
Expand Down
8 changes: 4 additions & 4 deletions src/ingest_validation_tests/codex_json_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,19 @@ class CodexJsonValidator(Validator):

def collect_errors(self, **kwargs) -> List[str]:
del kwargs
if 'codex' not in self.assay_type.lower():
if "codex" not in self.assay_type.lower():
return []

schema_path = Path(__file__).parent / 'codex_schema.json'
schema_path = Path(__file__).parent / "codex_schema.json"
schema = json.loads(schema_path.read_text())

rslt = []
for glob_expr in ['**/dataset.json']:
for glob_expr in ["**/dataset.json"]:
for path in self.paths:
for file in path.glob(glob_expr):
instance = json.loads(file.read_text())
try:
validate(instance=instance, schema=schema)
except Exception as e:
rslt.append(f'{file}: {e}')
rslt.append(f"{file}: {e}")
return rslt
24 changes: 6 additions & 18 deletions src/ingest_validation_tests/fastq_validator_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ def _validate_fastq_line_2(self, line: str) -> List[str]:
self._line_2_length = len(line)
self._last_line_2_number = self._line_number

invalid_chars = "".join(
c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS
)
invalid_chars = "".join(c for c in line if c not in self._FASTQ_LINE_2_VALID_CHARS)
if invalid_chars:
return [f"Line contains invalid character(s): {invalid_chars}"]

Expand All @@ -103,9 +101,7 @@ def _validate_fastq_line_4(self, line: str) -> List[str]:
errors: List[str] = []
invalid_chars = "".join(c for c in line if not 33 <= ord(c) <= 126)
if invalid_chars:
errors.append(
"Line contains invalid quality character(s): " f'"{invalid_chars}"'
)
errors.append("Line contains invalid quality character(s): " f'"{invalid_chars}"')

if len(line) != self._line_2_length:
errors.append(
Expand All @@ -129,9 +125,7 @@ def validate_fastq_record(self, line: str, line_number: int) -> List[str]:
self._VALIDATE_FASTQ_LINE_METHODS[line_index]
)

assert (
validator_method
), f"No validator method defined for record index {line_index}"
assert validator_method, f"No validator method defined for record index {line_index}"

return validator_method(self, line)

Expand All @@ -155,9 +149,7 @@ def validate_fastq_file(self, fastq_file: Path) -> None:

if not is_valid_filename(fastq_file.name):
# If we don't like the filename, don't bother reading the contents.
self.errors.append(
"Filename does not have proper format " "and will not be processed"
)
self.errors.append("Filename does not have proper format " "and will not be processed")
return

self._line_number = 0
Expand All @@ -170,9 +162,7 @@ def validate_fastq_file(self, fastq_file: Path) -> None:
self.errors.append(self._format_error(f"Bad gzip file: {fastq_file}."))
return
except IOError:
self.errors.append(
self._format_error(f"Unable to open FASTQ data file {fastq_file}.")
)
self.errors.append(self._format_error(f"Unable to open FASTQ data file {fastq_file}."))
return
self._file_record_counts[str(fastq_file)] = records_read

Expand Down Expand Up @@ -272,9 +262,7 @@ def main():
elif isinstance(args.filepaths, str):
filepaths = [Path(args.filepaths)]
else:
raise Exception(
f"Validator init received base_paths arg as type {type(args.filepaths)}"
)
raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}")

validator = FASTQValidatorLogic(True)
validator.validate_fastq_files_in_path(filepaths, Lock())
Expand Down
4 changes: 2 additions & 2 deletions src/ingest_validation_tests/gz_validator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import gzip
import re
from multiprocessing import Pool
from os import cpu_count
import re
from typing import List

import gzip
from ingest_validation_tools.plugin_validator import Validator


Expand Down
63 changes: 36 additions & 27 deletions src/ingest_validation_tests/publication_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
Test for some common errors in the directory and file structure of publications.
"""

from typing import List
import re
import json
import re
from pathlib import Path
from typing import List

import frontmatter
from ingest_validation_tools.plugin_validator import Validator

Expand All @@ -15,50 +16,57 @@ class PublicationValidator(Validator):
Test for some common errors in the directory and file structure of
publications.
"""

description = "Test for common problems found in publications"
cost = 1.0
base_url_re = r'(\s*\{\{\s*base_url\s*\}\})/(.*)'
url_re = r'[Uu][Rr][Ll]'
base_url_re = r"(\s*\{\{\s*base_url\s*\}\})/(.*)"
url_re = r"[Uu][Rr][Ll]"

def collect_errors(self, **kwargs) -> List[str]:
"""
Return the errors found by this validator
"""
del kwargs
if self.assay_type != 'Publication':
if self.assay_type != "Publication":
return [] # We only test Publication data
rslt = []
for path in self.paths:
try:
vignette_path = path / 'vignettes'
assert vignette_path.is_dir(), 'vignettes not found or not a directory'
for this_vignette_path in vignette_path.glob('*'):
assert this_vignette_path.is_dir(), (f"Found the non-dir {this_vignette_path}"
" in vignettes")
this_vignette_all_paths = set(this_vignette_path.glob('*'))
vignette_path = path / "vignettes"
assert vignette_path.is_dir(), "vignettes not found or not a directory"
for this_vignette_path in vignette_path.glob("*"):
assert this_vignette_path.is_dir(), (
f"Found the non-dir {this_vignette_path}" " in vignettes"
)
this_vignette_all_paths = set(this_vignette_path.glob("*"))
if not all(pth.is_file() for pth in this_vignette_all_paths):
raise AssertionError('Found a subdirectory in a vignette')
raise AssertionError("Found a subdirectory in a vignette")
md_found = False
vig_figures = []
for md_path in this_vignette_path.glob('*.md'):
for md_path in this_vignette_path.glob("*.md"):
if md_found:
raise AssertionError('A vignette has more than one markdown file')
raise AssertionError("A vignette has more than one markdown file")
else:
md_found = True
vig_fm = frontmatter.loads(md_path.read_text())
for key in ['name', 'figures']:
assert key in vig_fm.metadata, ('vignette markdown is incorrectly'
f' formatted or has no {key}')
for fig_dict in vig_fm.metadata['figures']:
assert 'file' in fig_dict, 'figure dict does not reference a file'
assert 'name' in fig_dict, 'figure dict does not provide a name'
vig_figures.append(fig_dict['file'])
for key in ["name", "figures"]:
assert key in vig_fm.metadata, (
"vignette markdown is incorrectly" f" formatted or has no {key}"
)
for fig_dict in vig_fm.metadata["figures"]:
assert "file" in fig_dict, "figure dict does not reference a file"
assert "name" in fig_dict, "figure dict does not provide a name"
vig_figures.append(fig_dict["file"])
this_vignette_all_paths.remove(md_path)
for fname in vig_figures:
rslt.extend(self.validate_vitessce_config(this_vignette_path / fname, path))
rslt.extend(
self.validate_vitessce_config(this_vignette_path / fname, path)
)
this_vignette_all_paths.remove(this_vignette_path / fname)
assert not this_vignette_all_paths, ('unexpected files in vignette:'
f' {list(str(elt) for elt in this_vignette_all_paths)}')
assert not this_vignette_all_paths, (
"unexpected files in vignette:"
f" {list(str(elt) for elt in this_vignette_all_paths)}"
)

except AssertionError as excp:
rslt.append(str(excp))
Expand Down Expand Up @@ -94,9 +102,10 @@ def validate_vitessce_config(self, json_path, path):
match = re.match(self.base_url_re, val)
if match: # it starts with {{ base_url }}
extra_url = match.group(2)
data_path = path / 'data' / extra_url
assert data_path.exists(), ("expected data file"
f" {Path('data') / extra_url} is absent")
data_path = path / "data" / extra_url
assert data_path.exists(), (
"expected data file" f" {Path('data') / extra_url} is absent"
)

except AssertionError as excp:
rslt.append(str(excp))
Expand Down
Loading

0 comments on commit 62f4355

Please sign in to comment.