Skip to content

Commit

Permalink
added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
gesinaphillips committed Oct 21, 2024
1 parent 396cebb commit 7c93ab5
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 70 deletions.
14 changes: 12 additions & 2 deletions src/ingest_validation_tests/fastq_validator_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ def is_valid_filename(filename: str) -> bool:


def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]:
"""
PREFIX (?P<prefix>.*(?:L\\d+)(?=[_](?:(?P<read_type>(?:R|read)(?=[123])|I(?=[12])))))
- (?P<prefix> | named capture group "prefix"
- .*(?:L\\d+) | match anything before pattern L# (where # represents 1 or more digits)
- (?=[_](?:(?P<read_type>(?:R|read)(?=[123])|I(?=[12])))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2]
"""
if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)):
return
# looking for fastq filenames matching pattern <prefix>_<lane>_[I1,I2,R1,R2,R3]_<set>
pattern = re.compile(
r"(?P<prefix>.*(?=[_](?:(?P<read_type>R|.read|I)\d)))(?:_(?P=read_type)\d_)(?P<set>\d+)"
r"(?P<prefix>.*(?:L\d+)(?=[_](?:(?P<read_type>(?:R|read)(?=[123]_)|I(?=[12]_)))))"
)
groups = pattern.match(filename)
if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set"]):
Expand Down Expand Up @@ -140,6 +148,8 @@ def _make_groups(self) -> dict[filename_pattern, list[Path]]:
groups[potential_match].append(file)
else:
self._ungrouped_files.append(file)
for group in groups.values():
group.sort()
return groups

_VALIDATE_FASTQ_LINE_METHODS = {
Expand Down Expand Up @@ -269,7 +279,7 @@ def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock):
for pattern, paths in groups.items():
comparison = {}
for path in paths:
comparison[path] = self._file_record_counts.get(str(path))
comparison[str(path)] = self._file_record_counts.get(str(path))
if not (len(set(comparison.values())) == 1):
self.errors.append(
f"Counts do not match among files matching pattern {pattern.prefix}_{pattern.read_type}#_{pattern.set_num}: {comparison}"
Expand Down
175 changes: 107 additions & 68 deletions tests/test_fastq_validator_logic.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import gzip
from pathlib import Path
from pathlib import Path, PosixPath
from typing import TextIO

import fastq_utils
import pytest

from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic
from src.ingest_validation_tests.fastq_validator_logic import (
FASTQValidatorLogic,
filename_pattern,
get_prefix_read_type_and_set,
)

_GOOD_RECORDS = """\
@A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT
Expand Down Expand Up @@ -197,11 +202,14 @@ def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path):

fastq_validator.validate_fastq_files_in_path([tmp_path], 2)

# Order of the files being processed is not guaranteed, however these
# strings ensure that a mismatch was found.
assert "(4 lines)" in fastq_validator.errors[0]
assert "does not match" in fastq_validator.errors[0]
assert "(8 lines)" in fastq_validator.errors[0]
# The mismatched counts are only recorded as text inside `errors`, so the checks below match substrings of that message
assert "Counts do not match" in fastq_validator.errors[0]
assert (
"SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq': 4" in fastq_validator.errors[0]
)
assert (
"SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq': 8" in fastq_validator.errors[0]
)

def test_fastq_comparison_good(self, fastq_validator, tmp_path):
filenames = [
Expand All @@ -219,67 +227,6 @@ def test_fastq_comparison_good(self, fastq_validator, tmp_path):

assert not fastq_validator.errors

# def test_fastq_comparison_bad_extra_R(self, fastq_validator, tmp_path):
# filenames = [
# "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_R1_002.fastq",
# "3252_ftL_RNA_T1_S31_L003_R2_002.fastq",
# "3252_ftL_RNA_T1_S31_L003_R3_001.fastq",
# ]
# for filename in filenames:
# new_file = tmp_path.joinpath(filename)
# with _open_output_file(new_file, False) as output:
# output.write(_GOOD_RECORDS)
#
# fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
# assert "IndexError: list index out of range" in fastq_validator.errors[0]

# def test_fastq_comparison_bad_unpaired_R(self, fastq_validator, tmp_path):
# filenames = [
# "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_R1_002.fastq",
# ]
# for filename in filenames:
# new_file = tmp_path.joinpath(filename)
# with _open_output_file(new_file, False) as output:
# output.write(_GOOD_RECORDS)
#
# fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
#
# assert fastq_validator.errors
#
# def test_fastq_comparison_bad_mixed_I_and_R(self, fastq_validator, tmp_path):
# filenames = [
# "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_I1_002.fastq",
# ]
# for filename in filenames:
# new_file = tmp_path.joinpath(filename)
# with _open_output_file(new_file, False) as output:
# output.write(_GOOD_RECORDS)
#
# fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
#
# assert fastq_validator.errors
#
# def test_fastq_comparison_bad_extra_unmatched_fastq(self, fastq_validator, tmp_path):
# filenames = [
# "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
# "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
# "bad_ftL_RNA_T1_S31_L003_R1_001.fastq",
# ]
# for filename in filenames:
# new_file = tmp_path.joinpath(filename)
# with _open_output_file(new_file, False) as output:
# output.write(_GOOD_RECORDS)
#
# fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
#
# assert fastq_validator.errors

def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_path):
good_file = "3252_ftL_RNA_T1_S31_L003_R1_001.fastq"
bad_file = "3252_ftL_RNA_T1_S31_L003_R2_001.fastq"
Expand All @@ -293,3 +240,95 @@ def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_pat
fastq_validator.validate_fastq_files_in_path([tmp_path], 2)

assert fastq_validator.errors

def test_fastq_groups_good(self, fastq_validator, tmp_path):
    """Valid files are grouped by prefix, read type and set number.

    Writes five files containing ``_GOOD_RECORDS`` that span two set numbers
    (``001`` and ``002``), runs validation, and checks that ``_make_groups``
    returns one group per set number, each listing its paths in ascending
    order, and that no errors were recorded.
    """
    prefix = "20147_Healthy_PA_S1_L001"
    set_001 = [f"{prefix}_R{read}_001.fastq" for read in (1, 2, 3)]
    set_002 = [f"{prefix}_R{read}_002.fastq" for read in (1, 2)]
    for name in set_001 + set_002:
        with _open_output_file(tmp_path / name, False) as output:
            output.write(_GOOD_RECORDS)

    fastq_validator.validate_fastq_files_in_path([tmp_path], 2)

    # Expected paths are derived from tmp_path itself instead of being wrapped
    # in PosixPath(...): joining onto tmp_path already yields the platform's
    # concrete Path type, and PosixPath cannot be instantiated on Windows.
    expected_groups = {
        filename_pattern(prefix=prefix, read_type="R", set_num="001"): [
            tmp_path / name for name in set_001
        ],
        filename_pattern(prefix=prefix, read_type="R", set_num="002"): [
            tmp_path / name for name in set_002
        ],
    }
    assert fastq_validator._make_groups() == expected_groups
    assert not fastq_validator.errors

def test_fastq_groups_bad(self, fastq_validator, tmp_path):
    """Groups whose members have differing record counts are reported.

    Three files receive ``_GOOD_RECORDS`` and two files (one in each set
    number) receive only the text ``@bad``. After validation, the first two
    entries of ``errors`` must each be a "Counts do not match" message that
    names the expected file.
    """
    valid_names = (
        "20147_Healthy_PA_S1_L001_R1_001.fastq",
        "20147_Healthy_PA_S1_L001_R2_001.fastq",
        "20147_Healthy_PA_S1_L001_R1_002.fastq",
    )
    malformed_names = (
        "20147_Healthy_PA_S1_L001_R3_001.fastq",
        "20147_Healthy_PA_S1_L001_R2_002.fastq",
    )
    # Write the well-formed files first, then the malformed ones.
    for names, content in ((valid_names, _GOOD_RECORDS), (malformed_names, "@bad")):
        for name in names:
            with _open_output_file(tmp_path.joinpath(name), False) as handle:
                handle.write(content)

    fastq_validator.validate_fastq_files_in_path([tmp_path], 2)

    # Position in `errors` -> file name that the message at that position must mention.
    expected_mentions = (
        "20147_Healthy_PA_S1_L001_R2_002.fastq",
        "20147_Healthy_PA_S1_L001_R3_001.fastq",
    )
    for position, mentioned_file in enumerate(expected_mentions):
        message = fastq_validator.errors[position]
        assert "Counts do not match" in message
        assert mentioned_file in message

def test_filename_valid_and_fastq_valid_but_not_grouped(self, fastq_validator, tmp_path):
    """Valid FASTQ files are accepted even when their names cannot be grouped.

    Every name in ``good_filenames`` is written with ``_GOOD_RECORDS`` and must
    appear in ``file_list`` with no errors recorded. Only the last four names
    are expected to yield a pattern from ``get_prefix_read_type_and_set``.
    """
    # The first seven names (indices 0-6) are valid but would not be grouped
    # for comparison; the reason is noted next to each one.
    good_filenames = [
        "B001A001_1.fastq",  # no lane, read_type, or set
        "B001A001_R1.fq",  # no lane or set
        "B001A001_I1.fastq.gz",  # no lane or set
        "H4L1-4_S64_R1_001.fastq.gz",  # no lane
        "H4L1-4_S64_L001_001.fastq.gz",  # no read_type
        "H4L1-4_S64_L001_R1.fastq.gz",  # no set
        "L001_H4L1-4_S64_R1_001.fastq.gz",  # out of order
        "H4L1-4_S64_L001_R1_001.fastq.gz",
        "H4L1-4_S64_L001_R2_001.fastq.gz",
        "H4L1-4_S64_L001_I1_001.fastq.gz",
        "Undetermined_S0_L001_R1_001.W105_Small_bowel_ileum.trimmed.fastq.gz",  # annotated but otherwise fits pattern
    ]
    for name in good_filenames:
        # Decide on compression from the extension, not from "gz" appearing
        # anywhere in the name.
        use_gzip = name.endswith(".gz")
        with _open_output_file(tmp_path / name, use_gzip) as output:
            output.write(_GOOD_RECORDS)

    fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
    # All files in good_filenames should be in file_list
    assert all(tmp_path / name in fastq_validator.file_list for name in good_filenames)
    # No errors should be found in any of those files
    assert not fastq_validator.errors
    # Only some files from good_filenames will match criteria for grouping.
    # Each path is parsed once; non-matching names return None and are dropped.
    parsed = [get_prefix_read_type_and_set(str(path)) for path in fastq_validator.file_list]
    valid_filename_patterns = [pattern for pattern in parsed if pattern is not None]
    # The full path string is parsed here, so each expected prefix includes the
    # directory; build it with pathlib so the separator matches the platform.
    h4l1_prefix = str(tmp_path / "H4L1-4_S64_L001")
    undetermined_prefix = str(tmp_path / "Undetermined_S0_L001")
    assert valid_filename_patterns == [
        filename_pattern(prefix=h4l1_prefix, read_type="R", set_num="001"),
        filename_pattern(prefix=h4l1_prefix, read_type="R", set_num="001"),
        filename_pattern(prefix=h4l1_prefix, read_type="I", set_num="001"),
        filename_pattern(prefix=undetermined_prefix, read_type="R", set_num="001"),
    ]

0 comments on commit 7c93ab5

Please sign in to comment.