added tests

hubmapconsortium · Oct 21, 2024 · 7c93ab5 · 7c93ab5
1 parent 396cebb
commit 7c93ab5
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 70 deletions.
diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py
@@ -20,10 +20,18 @@ def is_valid_filename(filename: str) -> bool:
 
 
 def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]:
+    """
+    PREFIX (?P<prefix>.*(?:L\\d+)(?=[_](?:(?P<read_type>(?:R|read)(?=[123]_)|I(?=[12]_)))))
+        - (?P<prefix> | named capture group "prefix"
+        - .*(?:L\\d+) | match anything up to and including pattern L# (where # represents 1 or more digits)
+        - (?=[_](?:(?P<read_type>(?:R|read)(?=[123]_)|I(?=[12]_)))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2]_
+
+    """
     if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)):
         return
+    # looking for fastq filenames matching pattern <prefix>_<lane>_[I1,I2,R1,R2,R3]_<set>
     pattern = re.compile(
-        r"(?P<prefix>.*(?=[_](?:(?P<read_type>R|.read|I)\d)))(?:_(?P=read_type)\d_)(?P<set>\d+)"
+        r"(?P<prefix>.*(?:L\d+)(?=[_](?:(?P<read_type>(?:R|read)(?=[123]_)|I(?=[12]_)))))"
     )
     groups = pattern.match(filename)
     if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set"]):
@@ -140,6 +148,8 @@ def _make_groups(self) -> dict[filename_pattern, list[Path]]:
                 groups[potential_match].append(file)
             else:
                 self._ungrouped_files.append(file)
+        for group in groups.values():
+            group.sort()
         return groups
 
     _VALIDATE_FASTQ_LINE_METHODS = {
@@ -269,7 +279,7 @@ def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock):
             for pattern, paths in groups.items():
                 comparison = {}
                 for path in paths:
-                    comparison[path] = self._file_record_counts.get(str(path))
+                    comparison[str(path)] = self._file_record_counts.get(str(path))
                 if not (len(set(comparison.values())) == 1):
                     self.errors.append(
                         f"Counts do not match among files matching pattern {pattern.prefix}_{pattern.read_type}#_{pattern.set_num}: {comparison}"

diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py
@@ -1,10 +1,15 @@
 import gzip
-from pathlib import Path
+from pathlib import Path, PosixPath
 from typing import TextIO
 
+import fastq_utils
 import pytest
 
-from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic
+from src.ingest_validation_tests.fastq_validator_logic import (
+    FASTQValidatorLogic,
+    filename_pattern,
+    get_prefix_read_type_and_set,
+)
 
 _GOOD_RECORDS = """\
 @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT
@@ -197,11 +202,14 @@ def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path):
 
         fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
 
-        # Order of the files being processed is not guaranteed, however these
-        # strings ensure that a mismatch was found.
-        assert "(4 lines)" in fastq_validator.errors[0]
-        assert "does not match" in fastq_validator.errors[0]
-        assert "(8 lines)" in fastq_validator.errors[0]
+        # Non-matching records only stored in errors, need to do ugly string match
+        assert "Counts do not match" in fastq_validator.errors[0]
+        assert (
+            "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq': 4" in fastq_validator.errors[0]
+        )
+        assert (
+            "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq': 8" in fastq_validator.errors[0]
+        )
 
     def test_fastq_comparison_good(self, fastq_validator, tmp_path):
         filenames = [
@@ -219,67 +227,6 @@ def test_fastq_comparison_good(self, fastq_validator, tmp_path):
 
         assert not fastq_validator.errors
 
-    # def test_fastq_comparison_bad_extra_R(self, fastq_validator, tmp_path):
-    #     filenames = [
-    #         "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R1_002.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R2_002.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R3_001.fastq",
-    #     ]
-    #     for filename in filenames:
-    #         new_file = tmp_path.joinpath(filename)
-    #         with _open_output_file(new_file, False) as output:
-    #             output.write(_GOOD_RECORDS)
-    #
-    #     fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
-    #     assert "IndexError: list index out of range" in fastq_validator.errors[0]
-
-    # def test_fastq_comparison_bad_unpaired_R(self, fastq_validator, tmp_path):
-    #     filenames = [
-    #         "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R1_002.fastq",
-    #     ]
-    #     for filename in filenames:
-    #         new_file = tmp_path.joinpath(filename)
-    #         with _open_output_file(new_file, False) as output:
-    #             output.write(_GOOD_RECORDS)
-    #
-    #     fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
-    #
-    #     assert fastq_validator.errors
-    #
-    # def test_fastq_comparison_bad_mixed_I_and_R(self, fastq_validator, tmp_path):
-    #     filenames = [
-    #         "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_I1_002.fastq",
-    #     ]
-    #     for filename in filenames:
-    #         new_file = tmp_path.joinpath(filename)
-    #         with _open_output_file(new_file, False) as output:
-    #             output.write(_GOOD_RECORDS)
-    #
-    #     fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
-    #
-    #     assert fastq_validator.errors
-    #
-    # def test_fastq_comparison_bad_extra_unmatched_fastq(self, fastq_validator, tmp_path):
-    #     filenames = [
-    #         "3252_ftL_RNA_T1_S31_L003_R1_001.fastq",
-    #         "3252_ftL_RNA_T1_S31_L003_R2_001.fastq",
-    #         "bad_ftL_RNA_T1_S31_L003_R1_001.fastq",
-    #     ]
-    #     for filename in filenames:
-    #         new_file = tmp_path.joinpath(filename)
-    #         with _open_output_file(new_file, False) as output:
-    #             output.write(_GOOD_RECORDS)
-    #
-    #     fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
-    #
-    #     assert fastq_validator.errors
-
     def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_path):
         good_file = "3252_ftL_RNA_T1_S31_L003_R1_001.fastq"
         bad_file = "3252_ftL_RNA_T1_S31_L003_R2_001.fastq"
@@ -293,3 +240,95 @@ def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_pat
         fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
 
         assert fastq_validator.errors
+
+    def test_fastq_groups_good(self, fastq_validator, tmp_path):
+        files = [
+            "20147_Healthy_PA_S1_L001_R1_001.fastq",
+            "20147_Healthy_PA_S1_L001_R2_001.fastq",
+            "20147_Healthy_PA_S1_L001_R3_001.fastq",
+            "20147_Healthy_PA_S1_L001_R1_002.fastq",
+            "20147_Healthy_PA_S1_L001_R2_002.fastq",
+        ]
+        for file in files:
+            with _open_output_file(tmp_path.joinpath(file), False) as output:
+                output.write(_GOOD_RECORDS)
+
+        fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
+        assert fastq_validator._make_groups() == {
+            filename_pattern(prefix="20147_Healthy_PA_S1_L001", read_type="R", set_num="001"): [
+                PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R1_001.fastq")),
+                PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R2_001.fastq")),
+                PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R3_001.fastq")),
+            ],
+            filename_pattern(prefix="20147_Healthy_PA_S1_L001", read_type="R", set_num="002"): [
+                PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R1_002.fastq")),
+                PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R2_002.fastq")),
+            ],
+        }
+        assert not fastq_validator.errors
+
+    def test_fastq_groups_bad(self, fastq_validator, tmp_path):
+        good_files = [
+            "20147_Healthy_PA_S1_L001_R1_001.fastq",
+            "20147_Healthy_PA_S1_L001_R2_001.fastq",
+            "20147_Healthy_PA_S1_L001_R1_002.fastq",
+        ]
+        bad_files = [
+            "20147_Healthy_PA_S1_L001_R3_001.fastq",
+            "20147_Healthy_PA_S1_L001_R2_002.fastq",
+        ]
+        for file in good_files:
+            with _open_output_file(tmp_path.joinpath(file), False) as output:
+                output.write(_GOOD_RECORDS)
+        for file in bad_files:
+            with _open_output_file(tmp_path.joinpath(file), False) as output:
+                output.write("@bad")
+
+        fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
+        assert "Counts do not match" in fastq_validator.errors[0]
+        assert "20147_Healthy_PA_S1_L001_R2_002.fastq" in fastq_validator.errors[0]
+        assert "Counts do not match" in fastq_validator.errors[1]
+        assert "20147_Healthy_PA_S1_L001_R3_001.fastq" in fastq_validator.errors[1]
+
+    def test_filename_valid_and_fastq_valid_but_not_grouped(self, fastq_validator, tmp_path):
+        # good_filenames[0:7] are valid but would not be grouped for comparison
+        good_filenames = [
+            "B001A001_1.fastq",  # no lane, read_type, or set
+            "B001A001_R1.fq",  # no lane or set
+            "B001A001_I1.fastq.gz",  # no lane or set
+            "H4L1-4_S64_R1_001.fastq.gz",  # no lane
+            "H4L1-4_S64_L001_001.fastq.gz",  # no read_type
+            "H4L1-4_S64_L001_R1.fastq.gz",  # no set
+            "L001_H4L1-4_S64_R1_001.fastq.gz",  # out of order
+            "H4L1-4_S64_L001_R1_001.fastq.gz",
+            "H4L1-4_S64_L001_R2_001.fastq.gz",
+            "H4L1-4_S64_L001_I1_001.fastq.gz",
+            "Undetermined_S0_L001_R1_001.W105_Small_bowel_ileum.trimmed.fastq.gz",  # annotated but otherwise fits pattern
+        ]
+        for file in good_filenames:
+            use_gzip = bool("gz" in file)
+            with _open_output_file(tmp_path.joinpath(file), use_gzip) as output:
+                output.write(_GOOD_RECORDS)
+
+        fastq_validator.validate_fastq_files_in_path([tmp_path], 2)
+        # All files in good_filenames should be in file_list
+        assert {
+            PosixPath(tmp_path.joinpath(file)) in fastq_validator.file_list
+            for file in good_filenames
+        } == {True}
+        # No errors should be found in any of those files
+        assert not fastq_validator.errors
+        # Only some files from good_filenames will match criteria for grouping
+        valid_filename_patterns = [
+            get_prefix_read_type_and_set(str(file))
+            for file in fastq_validator.file_list
+            if get_prefix_read_type_and_set(str(file)) is not None
+        ]
+        assert valid_filename_patterns == [
+            filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001"),
+            filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001"),
+            filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="I", set_num="001"),
+            filename_pattern(
+                prefix=f"{tmp_path}/Undetermined_S0_L001", read_type="R", set_num="001"
+            ),
+        ]