From 499d701f97b6a415ba189116e98ff4993ac804d0 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Tue, 4 Jul 2023 15:27:31 +1000 Subject: [PATCH 1/7] Improvement to performance of fastqread --- src/fqfa/fastq/fastqread.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/fqfa/fastq/fastqread.py b/src/fqfa/fastq/fastqread.py index 106a07c..632a4da 100644 --- a/src/fqfa/fastq/fastqread.py +++ b/src/fqfa/fastq/fastqread.py @@ -4,7 +4,6 @@ from dataclasses import dataclass, field, InitVar from typing import List, Optional, ClassVar, Callable, Match -from statistics import mean from fqfa.util.nucleotide import reverse_complement from fqfa.validator.create import create_validator from fqfa.constants.iupac.dna import DNA_BASES @@ -99,7 +98,10 @@ def __post_init__(self, quality_string: str) -> None: # mypy false positive: https://github.com/python/mypy/issues/5485 raise ValueError("unexpected characters in sequence") - self.quality = [ord(c) - self.quality_encoding_value for c in quality_string] + quality_string_bytes = quality_string.encode('ascii') + qev = self.quality_encoding_value + self.quality = [qsb - qev for qsb in quality_string_bytes] + if min(self.quality) < 0: raise ValueError("sequence quality value below 0") if max(self.quality) > 93: @@ -139,7 +141,7 @@ def average_quality(self) -> float: Mean quality value. """ - return mean(self.quality) + return sum(self.quality)/len(self.quality) def min_quality(self) -> int: """Calculates and returns the read's minimum quality value. From ba623af6dc06dcc3ffa8ab9119f66fcf326f4b3c Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Sat, 26 Aug 2023 14:23:51 +1000 Subject: [PATCH 2/7] add mypy type checking to actions --- .github/workflows/python-package.yml | 7 +++++-- src/fqfa/fastq/fastq.py | 9 +++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 28f0840..88d6cd6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest + python -m pip install flake8 pytest mypy if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Install package run: | @@ -40,4 +40,7 @@ jobs: flake8 . --count --exit-zero --statistics - name: Test with pytest run: | - pytest + python -m pytest + - name: Type check with mypy + run: | + python -m mypy src/ diff --git a/src/fqfa/fastq/fastq.py b/src/fqfa/fastq/fastq.py index af5037c..1d2b294 100644 --- a/src/fqfa/fastq/fastq.py +++ b/src/fqfa/fastq/fastq.py @@ -37,7 +37,8 @@ def parse_fastq_reads(handle: TextIO) -> Generator[FastqRead, None, None]: raise ValueError("incomplete FASTQ record") else: lines = [x.rstrip() for x in lines] # remove trailing newlines - yield FastqRead(*lines) + # TODO: figure out why mypy doesn't like this + yield FastqRead(*lines) # type: ignore[arg-type] def parse_fastq_pe_reads( @@ -80,9 +81,9 @@ def parse_fastq_pe_reads( for fwd, rev in zip_longest(fwd_generator, rev_generator, fillvalue=None): if None in (fwd, rev): raise ValueError("mismatched FASTQ file lengths") - elif fwd.header.split()[0] != rev.header.split()[0]: + elif fwd.header.split()[0] != rev.header.split()[0]: # type: ignore[union-attr] raise ValueError("forward and reverse read headers do not match") else: if revcomp: - rev.reverse_complement() - yield fwd, rev + rev.reverse_complement() # type: ignore[union-attr] + yield fwd, rev # type: ignore[misc] From 89f1fc9f598df73e31a268070b7129485a2fa71d Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Aug 2023 15:09:40 +1000 Subject: [PATCH 3/7] update black settings --- .flake8 | 3 +- pyproject.toml | 3 ++ src/fqfa/fastq/fastqread.py | 8 ++---- src/fqfa/util/infer.py | 4 +-- src/fqfa/util/translate.py | 17 ++--------- tests/test_fastqread.py | 52 +++++++++------------------------- tests/test_util_infer.py | 28 +++++------------- tests/test_validator_create.py | 8 ++---- 8 files changed, 32 insertions(+), 91 deletions(-) diff --git a/.flake8 b/.flake8 index 3237997..cea1085 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,3 @@ [flake8] -extend-ignore = E203 -max-line-length = 88 +extend-ignore = E203, E501 max-complexity = 10 diff --git a/pyproject.toml b/pyproject.toml index ca9bd2e..4b058c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,9 @@ dev = [ "pytest", ] +[tool.black] +line-length = 120 + [tool.hatch.version] path = "src/fqfa/__init__.py" diff --git a/src/fqfa/fastq/fastqread.py b/src/fqfa/fastq/fastqread.py index 106a07c..587eed7 100644 --- a/src/fqfa/fastq/fastqread.py +++ b/src/fqfa/fastq/fastqread.py @@ -52,9 +52,7 @@ class FastqRead: quality: List[int] = field(init=False) quality_string: InitVar[str] quality_encoding_value: int = 33 - _sequence_validator: ClassVar[ - Callable[[str], Optional[Match[str]]] - ] = create_validator(DNA_BASES + ["N"]) + _sequence_validator: ClassVar[Callable[[str], Optional[Match[str]]]] = create_validator(DNA_BASES + ["N"]) def __post_init__(self, quality_string: str) -> None: """Perform some basic checks on the input and converts the quality string into a @@ -125,9 +123,7 @@ def __str__(self) -> str: Reconstruction of the original FASTQ record. """ - quality_string = "".join( - [chr(q + self.quality_encoding_value) for q in self.quality] - ) + quality_string = "".join([chr(q + self.quality_encoding_value) for q in self.quality]) return "\n".join((self.header, self.sequence, self.header2, quality_string)) def average_quality(self) -> float: diff --git a/src/fqfa/util/infer.py b/src/fqfa/util/infer.py index f71a7eb..f7f447d 100644 --- a/src/fqfa/util/infer.py +++ b/src/fqfa/util/infer.py @@ -59,9 +59,7 @@ def infer_sequence_type(seq: str, report_iupac: bool = True) -> Optional[str]: return None -def infer_all_sequence_types( # noqa: max-complexity: 11 - seq: str, report_iupac: bool = True -) -> Optional[List[str]]: +def infer_all_sequence_types(seq: str, report_iupac: bool = True) -> Optional[List[str]]: # noqa: max-complexity: 11 """Return all inferred types for the given sequence. Sequence types include: diff --git a/src/fqfa/util/translate.py b/src/fqfa/util/translate.py index cd6f979..abf512d 100644 --- a/src/fqfa/util/translate.py +++ b/src/fqfa/util/translate.py @@ -9,9 +9,7 @@ __all__ = ["translate_dna", "ncbi_genetic_code_to_dict"] -def translate_dna( - seq: str, table: Optional[Dict[str, str]] = None, frame: int = 0 -) -> Tuple[str, Optional[str]]: +def translate_dna(seq: str, table: Optional[Dict[str, str]] = None, frame: int = 0) -> Tuple[str, Optional[str]]: """ Translate a DNA sequence into the corresponding amino acid sequence. @@ -116,11 +114,7 @@ def ncbi_genetic_code_to_dict( # noqa: max-complexity: 11 If the AAs row contains a character other than an amino acid. """ - lines = [ - s.strip() - for s in ncbi_string.split("\n") - if len(s) > 0 and not s.startswith("#") and not s.isspace() - ] + lines = [s.strip() for s in ncbi_string.split("\n") if len(s) > 0 and not s.startswith("#") and not s.isspace()] if len(lines) != 5: raise ValueError("transl_table string must have 5 lines") @@ -152,12 +146,7 @@ def ncbi_genetic_code_to_dict( # noqa: max-complexity: 11 codon_dict: Dict[str, str] = dict() for aa, codon in zip( transl_table["AAs"], - ( - "".join(nts) - for nts in zip( - transl_table["Base1"], transl_table["Base2"], transl_table["Base3"] - ) - ), + ("".join(nts) for nts in zip(transl_table["Base1"], transl_table["Base2"], transl_table["Base3"])), ): if codon not in codon_dict: codon_dict[codon] = aa diff --git a/tests/test_fastqread.py b/tests/test_fastqread.py index 0f0569e..3d34139 100644 --- a/tests/test_fastqread.py +++ b/tests/test_fastqread.py @@ -22,9 +22,7 @@ def test_creation_no_errors(self) -> None: self.assertEqual(test_read.sequence, self.test_kwargs["sequence"]) self.assertEqual(test_read.header2, self.test_kwargs["header2"]) self.assertListEqual(test_read.quality, self.test_quality) - self.assertEqual( - test_read.quality_encoding_value, self.test_kwargs["quality_encoding_value"] - ) + self.assertEqual(test_read.quality_encoding_value, self.test_kwargs["quality_encoding_value"]) def test_creation_bad_header(self) -> None: test_kwargs = self.test_kwargs.copy() @@ -55,9 +53,7 @@ def test_creation_bad_bases(self) -> None: # bad internal base/ambiguity character test_kwargs = self.test_kwargs.copy() - test_kwargs["sequence"] = ( - self.test_kwargs["sequence"][:1] + "W" + self.test_kwargs["sequence"][2:] - ) + test_kwargs["sequence"] = self.test_kwargs["sequence"][:1] + "W" + self.test_kwargs["sequence"][2:] self.assertRaises(ValueError, FastqRead, **test_kwargs) # bad last base/number @@ -74,9 +70,7 @@ def test_creation_bad_quality(self) -> None: # bad internal value/unicode character test_kwargs = self.test_kwargs.copy() test_kwargs["quality_string"] = ( - self.test_kwargs["quality_string"][:1] - + "µ" - + self.test_kwargs["quality_string"][2:] + self.test_kwargs["quality_string"][:1] + "µ" + self.test_kwargs["quality_string"][2:] ) self.assertRaises(ValueError, FastqRead, **test_kwargs) @@ -117,9 +111,7 @@ def test_trim_start(self) -> None: test_read.trim(start=len(test_read)) self.assertEqual(len(test_read), 1) self.assertEqual(len(test_read.sequence), len(test_read.quality)) - self.assertEqual( - test_read.sequence, FastqRead(**self.test_kwargs).sequence[-1:] - ) + self.assertEqual(test_read.sequence, FastqRead(**self.test_kwargs).sequence[-1:]) def test_trim_end(self) -> None: test_read = FastqRead(**self.test_kwargs) @@ -130,9 +122,7 @@ def test_trim_end(self) -> None: test_read.trim(end=len(test_read) - 1) self.assertEqual(len(test_read), len(FastqRead(**self.test_kwargs)) - 1) self.assertEqual(len(test_read.sequence), len(test_read.quality)) - self.assertEqual( - test_read.sequence, FastqRead(**self.test_kwargs).sequence[:-1] - ) + self.assertEqual(test_read.sequence, FastqRead(**self.test_kwargs).sequence[:-1]) test_read = FastqRead(**self.test_kwargs) test_read.trim(end=1) @@ -145,17 +135,13 @@ def test_trim_both_ends(self) -> None: test_read.trim(start=2, end=2) self.assertEqual(len(test_read), 1) self.assertEqual(len(test_read.sequence), len(test_read.quality)) - self.assertEqual( - test_read.sequence, FastqRead(**self.test_kwargs).sequence[1:2] - ) + self.assertEqual(test_read.sequence, FastqRead(**self.test_kwargs).sequence[1:2]) test_read = FastqRead(**self.test_kwargs) test_read.trim(start=2, end=4) self.assertEqual(len(test_read), 3) self.assertEqual(len(test_read.sequence), len(test_read.quality)) - self.assertEqual( - test_read.sequence, FastqRead(**self.test_kwargs).sequence[1:4] - ) + self.assertEqual(test_read.sequence, FastqRead(**self.test_kwargs).sequence[1:4]) def test_trim_bad_parameters(self) -> None: test_read = FastqRead(**self.test_kwargs) @@ -201,21 +187,15 @@ def test_trim_length(self) -> None: test_read.trim_length(start=2, length=4) self.assertEqual(len(test_read), 4) self.assertEqual(len(test_read.sequence), len(test_read.quality)) - self.assertEqual( - test_read.sequence, FastqRead(**self.test_kwargs).sequence[1:5] - ) + self.assertEqual(test_read.sequence, FastqRead(**self.test_kwargs).sequence[1:5]) def test_trim_length_bad_parameters(self) -> None: test_read = FastqRead(**self.test_kwargs) # bad start parameters - self.assertRaises( - ValueError, test_read.trim_length, start=-1, length=len(test_read) - ) + self.assertRaises(ValueError, test_read.trim_length, start=-1, length=len(test_read)) self.assertEqual(test_read, FastqRead(**self.test_kwargs)) - self.assertRaises( - ValueError, test_read.trim_length, start=0, length=len(test_read) - ) + self.assertRaises(ValueError, test_read.trim_length, start=0, length=len(test_read)) self.assertEqual(test_read, FastqRead(**self.test_kwargs)) self.assertRaises( ValueError, @@ -236,9 +216,7 @@ def test_trim_length_bad_parameters(self) -> None: # bad parameter combinations self.assertRaises(ValueError, test_read.trim_length, start=-1, length=0) self.assertEqual(test_read, FastqRead(**self.test_kwargs)) - self.assertRaises( - ValueError, test_read.trim_length, start=2, length=len(test_read) - ) + self.assertRaises(ValueError, test_read.trim_length, start=2, length=len(test_read)) self.assertEqual(test_read, FastqRead(**self.test_kwargs)) def test_reverse_complement(self) -> None: @@ -246,14 +224,10 @@ def test_reverse_complement(self) -> None: test_read.reverse_complement() self.assertEqual(test_read.header, self.test_kwargs["header"]) - self.assertEqual( - test_read.sequence, reverse_complement(self.test_kwargs["sequence"]) - ) + self.assertEqual(test_read.sequence, reverse_complement(self.test_kwargs["sequence"])) self.assertEqual(test_read.header2, self.test_kwargs["header2"]) self.assertListEqual(test_read.quality, self.test_quality[::-1]) - self.assertEqual( - test_read.quality_encoding_value, self.test_kwargs["quality_encoding_value"] - ) + self.assertEqual(test_read.quality_encoding_value, self.test_kwargs["quality_encoding_value"]) if __name__ == "__main__": diff --git a/tests/test_util_infer.py b/tests/test_util_infer.py index 9089178..bd71563 100644 --- a/tests/test_util_infer.py +++ b/tests/test_util_infer.py @@ -21,9 +21,7 @@ def test_protein(self): self.assertEqual(infer_sequence_type("MDLSALRVEE"), "protein") def test_protein_iupac(self): - self.assertEqual( - infer_sequence_type("LIVWZ", report_iupac=True), "protein-iupac" - ) + self.assertEqual(infer_sequence_type("LIVWZ", report_iupac=True), "protein-iupac") self.assertEqual(infer_sequence_type("LIVWZ", report_iupac=False), "protein") def test_lowercase(self): @@ -50,25 +48,19 @@ def test_dna(self): infer_all_sequence_types("ACGT", report_iupac=True), ["dna", "dna-iupac", "protein", "protein-iupac"], ) - self.assertListEqual( - infer_all_sequence_types("ACGT", report_iupac=False), ["dna", "protein"] - ) + self.assertListEqual(infer_all_sequence_types("ACGT", report_iupac=False), ["dna", "protein"]) self.assertListEqual( infer_all_sequence_types("TTTTT", report_iupac=True), ["dna", "dna-iupac", "protein", "protein-iupac"], ) - self.assertListEqual( - infer_all_sequence_types("TTTTT", report_iupac=False), ["dna", "protein"] - ) + self.assertListEqual(infer_all_sequence_types("TTTTT", report_iupac=False), ["dna", "protein"]) def test_dna_iupac(self): self.assertListEqual( infer_all_sequence_types("AWGT", report_iupac=True), ["dna-iupac", "protein", "protein-iupac"], ) - self.assertListEqual( - infer_all_sequence_types("AWGT", report_iupac=False), ["dna", "protein"] - ) + self.assertListEqual(infer_all_sequence_types("AWGT", report_iupac=False), ["dna", "protein"]) def test_rna(self): self.assertListEqual(infer_all_sequence_types("ACGU"), ["rna"]) @@ -79,17 +71,11 @@ def test_protein(self): infer_all_sequence_types("LIVW", report_iupac=True), ["protein", "protein-iupac"], ) - self.assertListEqual( - infer_all_sequence_types("MDLSALRVEE", report_iupac=False), ["protein"] - ) + self.assertListEqual(infer_all_sequence_types("MDLSALRVEE", report_iupac=False), ["protein"]) def test_protein_iupac(self): - self.assertListEqual( - infer_all_sequence_types("LIVWZ", report_iupac=True), ["protein-iupac"] - ) - self.assertListEqual( - infer_all_sequence_types("LIVWZ", report_iupac=False), ["protein"] - ) + self.assertListEqual(infer_all_sequence_types("LIVWZ", report_iupac=True), ["protein-iupac"]) + self.assertListEqual(infer_all_sequence_types("LIVWZ", report_iupac=False), ["protein"]) def test_lowercase(self): self.assertIsNone(infer_all_sequence_types("acgt")) diff --git a/tests/test_validator_create.py b/tests/test_validator_create.py index f707a79..d2713d6 100644 --- a/tests/test_validator_create.py +++ b/tests/test_validator_create.py @@ -64,13 +64,9 @@ def test_create_from_list(self) -> None: # invalid list arguments self.assertRaises(ValueError, create_validator, ["A", "C", "GT"]) - self.assertRaises( - ValueError, create_validator, ["A", "C", "GT"], case_sensitive=False - ) + self.assertRaises(ValueError, create_validator, ["A", "C", "GT"], case_sensitive=False) self.assertRaises(ValueError, create_validator, ["A", "C", ""]) - self.assertRaises( - ValueError, create_validator, ["A", "C", ""], case_sensitive=False - ) + self.assertRaises(ValueError, create_validator, ["A", "C", ""], case_sensitive=False) if __name__ == "__main__": From 0dece8039cf017802ac0128cb79f7371c5b2e759 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Aug 2023 15:30:40 +1000 Subject: [PATCH 4/7] add flake8 pyproject support --- .flake8 | 3 --- .pre-commit-config.yaml | 2 ++ pyproject.toml | 5 +++++ src/fqfa/fastq/fastqread.py | 4 ++-- 4 files changed, 9 insertions(+), 5 deletions(-) delete mode 100644 .flake8 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index cea1085..0000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -extend-ignore = E203, E501 -max-complexity = 10 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1912aef..adf0b9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,3 +8,5 @@ repos: rev: 5.0.4 hooks: - id: flake8 + additional_dependencies: + - Flake8-pyproject diff --git a/pyproject.toml b/pyproject.toml index 4b058c9..d133e81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,11 +34,16 @@ dev = [ "flake8", "pre-commit", "pytest", + "Flake8-pyproject", ] [tool.black] line-length = 120 +[tool.flake8] +extend-ignore = ["E203", "E501"] +max-complexity = 10 + [tool.hatch.version] path = "src/fqfa/__init__.py" diff --git a/src/fqfa/fastq/fastqread.py b/src/fqfa/fastq/fastqread.py index 6e2019e..af7aba6 100644 --- a/src/fqfa/fastq/fastqread.py +++ b/src/fqfa/fastq/fastqread.py @@ -96,7 +96,7 @@ def __post_init__(self, quality_string: str) -> None: # mypy false positive: https://github.com/python/mypy/issues/5485 raise ValueError("unexpected characters in sequence") - quality_string_bytes = quality_string.encode('ascii') + quality_string_bytes = quality_string.encode("ascii") qev = self.quality_encoding_value self.quality = [qsb - qev for qsb in quality_string_bytes] @@ -137,7 +137,7 @@ def average_quality(self) -> float: Mean quality value. """ - return sum(self.quality)/len(self.quality) + return sum(self.quality) / len(self.quality) def min_quality(self) -> int: """Calculates and returns the read's minimum quality value. From fb556c2e48cce00fb73550a7e03c2c8d473cd6a5 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Aug 2023 15:33:16 +1000 Subject: [PATCH 5/7] update version --- src/fqfa/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fqfa/__init__.py b/src/fqfa/__init__.py index 1e7fbba..0ecbff1 100644 --- a/src/fqfa/__init__.py +++ b/src/fqfa/__init__.py @@ -9,7 +9,7 @@ ) from fqfa.util.translate import translate_dna, ncbi_genetic_code_to_dict -__version__ = "1.2.3" +__version__ = "1.3.0" __all__ = [ "__version__", From d2f6fc431962841b44a68fcfa2c0e844cd77985e Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Aug 2023 16:26:13 +1000 Subject: [PATCH 6/7] update actions installation to use pyproject not requirements --- .github/workflows/python-package.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 88d6cd6..f34b399 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,19 +24,15 @@ jobs: uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest mypy - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Install package run: | - python -m pip install . + python -m pip install --upgrade pip + python -m pip install .[dev] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + # exit-zero treats all errors as warnings flake8 . --count --exit-zero --statistics - name: Test with pytest run: | From 69b29cd7087dddd2012bd0ee8d05e6ea602814de Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Aug 2023 16:27:36 +1000 Subject: [PATCH 7/7] add missing dev dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d133e81..2a7ef4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dev = [ "pre-commit", "pytest", "Flake8-pyproject", + "mypy", ] [tool.black]