-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #139 from phac-nml/file_count_validation
File count validation
- Loading branch information
Showing
17 changed files
with
300 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
VERSION_NUMBER = "0.8.1" | ||
VERSION_NUMBER = "0.8.2" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import iridauploader.model as model | ||
from iridauploader.parsers.exceptions.sequence_file_error import SequenceFileError | ||
|
||
|
||
def validate_uniform_file_count(sequencing_run):
    """
    Check that every sample in a SequencingRun agrees with the run's layout (paired end or single end)

    The run's own `is_paired_end()` answer decides the expected layout; each sample whose sequence file
    disagrees with it adds one SequenceFileError to the result.
    :param sequencing_run: SequencingRun object to validate
    :return: ValidationResult object with one error per mismatched sample, if any
    """
    run_is_paired = sequencing_run.is_paired_end()
    # only used for the error message: paired end runs have 2 files per sample, single end runs have 1
    if run_is_paired:
        expected_file_count = 2
    else:
        expected_file_count = 1

    result = model.ValidationResult()
    message_template = ("File count for sample `{}` does not match the expected file count `{}`. "
                        "Please verify your data.")

    # walk the samples of every project, in project order
    every_sample = (sample for project in sequencing_run.project_list for sample in project.sample_list)
    for sample in every_sample:
        if _matching_find_count(sample, run_is_paired):
            continue
        message = message_template.format(sample.sample_name, expected_file_count)
        result.add_error(SequenceFileError(message))

    return result
|
||
|
||
def _matching_find_count(sample, paired):
    """
    checks paired end / single end file matching on sample

    A sample with no sequence file attached cannot match either layout, so it is reported as a
    mismatch instead of raising an AttributeError part way through validation.
    :param sample: Sample object
    :param paired: boolean, True when the run is expected to be paired end
    :return: boolean, True when the layout of the sample's sequence file agrees with `paired`
    """
    sequence_file = getattr(sample, "sequence_file", None)
    if sequence_file is None:
        # nothing to compare against: treat as "does not match" so the caller records an error
        return False
    return sequence_file.is_paired_end() == paired
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+864 Bytes
.../core/fake_ngs_data_name_error/Data/Intensities/BaseCalls/01-1111_S1_L001_R1_001.fastq.gz
Binary file not shown.
Binary file added
BIN
+864 Bytes
.../core/fake_ngs_data_name_error/Data/Intensities/BaseCalls/01-1111_S1_L001_R2_001.fastq.gz
Binary file not shown.
Binary file added
BIN
+864 Bytes
.../core/fake_ngs_data_name_error/Data/Intensities/BaseCalls/02-2222_S1_L001_R1_001.fastq.gz
Binary file not shown.
Binary file added
BIN
+864 Bytes
...core/fake_ngs_data_name_error/Data/Intensities/BaseCalls/02-2222x_S1_L001_R2_001.fastq.gz
Binary file not shown.
Binary file added
BIN
+864 Bytes
.../core/fake_ngs_data_name_error/Data/Intensities/BaseCalls/03-3333_S1_L001_R1_001.fastq.gz
Binary file not shown.
Binary file added
BIN
+864 Bytes
.../core/fake_ngs_data_name_error/Data/Intensities/BaseCalls/03-3333_S1_L001_R2_001.fastq.gz
Binary file not shown.
24 changes: 24 additions & 0 deletions
24
iridauploader/tests/core/fake_ngs_data_name_error/SampleSheet.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
[Header] | ||
IEMFileVersion,4 | ||
Investigator Name,Denji | ||
Experiment Name,1 | ||
Date,12/3/2018 | ||
Workflow,GenerateFASTQ | ||
Application,FASTQ Only | ||
Assay,Nextera XT | ||
Description,Chainsaw | ||
Chemistry,Moter Oil | ||
|
||
[Reads] | ||
251 | ||
250 | ||
|
||
[Settings] | ||
ReverseComplement,0 | ||
Adapter,AAAAGGGGAAAAGGGGAAA | ||
|
||
[Data] | ||
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description | ||
01-1111,01-1111,1,01,N01,AAAAAAAA,S01,TTTTTTTT,6,Super bug | ||
02-2222,02-2222,2,02,N02,GGGGGGGG,S02,CCCCCCCC,6,Scary bug | ||
03-3333,03-3333,3,03,N03,CCCCCCCC,S03,GGGGGGGG,6,Deadly bug |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
209 changes: 209 additions & 0 deletions
209
iridauploader/tests/core/test_uniform_file_count_validator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
import unittest | ||
from os import path | ||
|
||
|
||
from iridauploader.core import uniform_file_count_validator | ||
from iridauploader.parsers.miseq import parser as miseq_parser | ||
from iridauploader.parsers.exceptions import SequenceFileError | ||
from iridauploader import model | ||
|
||
# Absolute directory of this test module; fixture paths are built relative to it.
# Falls back to the current directory if the computed path is empty.
path_to_module = path.abspath(path.dirname(__file__)) or '.'
|
||
|
||
class TestValidateFileSizeMinimum(unittest.TestCase):
    """
    Testing the validate_uniform_file_count function

    NOTE(review): the class name does not mention the uniform file count validator and looks copied
    from another test module; it is kept unchanged so existing test selectors keep working.
    """

    def setUp(self):
        print("\nStarting " + self.__module__ + ": " + self._testMethodName)

    @staticmethod
    def _fastq_path(file_name):
        """
        Build the path of a fastq file inside the `fake_ngs_data` fixture directory
        :param file_name: name of the file in Data/Intensities/BaseCalls
        :return: path string rooted at this test module's directory
        """
        return path.join(path_to_module,
                         "fake_ngs_data", "Data", "Intensities", "BaseCalls", file_name)

    @classmethod
    def _build_seq_run(cls, layout_type, file_names_per_sample):
        """
        Build a miseq SequencingRun with one project and one sample per entry of file_names_per_sample
        :param layout_type: value stored under the "layoutType" metadata key
        :param file_names_per_sample: list of lists of fastq file names, one inner list per sample
        :return: SequencingRun object
        """
        # create all SequenceFile objects first, then the samples that hold them
        sequence_files = [
            model.SequenceFile([cls._fastq_path(file_name) for file_name in file_names])
            for file_names in file_names_per_sample
        ]
        samples = []
        for sequence_file in sequence_files:
            sample = model.Sample("test_sample", "description", 1)
            sample.sequence_file = sequence_file
            samples.append(sample)
        project = model.Project("test_project", samples, "description")
        return model.SequencingRun({"layoutType": layout_type}, [project], "miseq")

    @staticmethod
    def _make_seq_run_paired():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset is a paired end run
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run("PAIRED_END", [
            ["01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"],
            ["02-2222_S1_L001_R1_001.fastq.gz", "02-2222_S1_L001_R2_001.fastq.gz"],
            ["03-3333_S1_L001_R1_001.fastq.gz", "03-3333_S1_L001_R2_001.fastq.gz"],
        ])

    @staticmethod
    def _make_seq_run_single():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset is a single end run
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run("SINGLE_END", [
            ["01-1111_S1_L001_R1_001.fastq.gz"],
            ["02-2222_S1_L001_R1_001.fastq.gz"],
            ["03-3333_S1_L001_R1_001.fastq.gz"],
        ])

    @staticmethod
    def _make_seq_run_mixed():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset mixes single end and paired end runs, but identifies as paired end
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run("PAIRED_END", [
            ["01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"],
            ["02-2222_S1_L001_R1_001.fastq.gz"],
            ["03-3333_S1_L001_R1_001.fastq.gz"],
        ])

    @staticmethod
    def _make_seq_run_paired_with_incorrect_file_name():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset is a paired end run with a common user error of a misnamed file

        NOTE(review): no test in this class calls this builder, and it points at the `fake_ngs_data`
        directory - confirm the misnamed `02-2222x` file exists there before relying on it.
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run("PAIRED_END", [
            ["01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"],
            ["02-2222_S1_L001_R1_001.fastq.gz", "02-2222x_S1_L001_R2_001.fastq.gz"],
            ["03-3333_S1_L001_R1_001.fastq.gz", "03-3333_S1_L001_R2_001.fastq.gz"],
        ])

    def test_run_valid_paired(self):
        sequencing_run = self._make_seq_run_paired()

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertTrue(res.is_valid(), "valid run is being detected as invalid")

    def test_run_valid_single(self):
        sequencing_run = self._make_seq_run_single()

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertTrue(res.is_valid(), "valid run is being detected as invalid")

    def test_run_valid_mixed_expecting_paired(self):
        # despite the "valid" in the test name, a mixed run must be reported as invalid
        sequencing_run = self._make_seq_run_mixed()

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertFalse(res.is_valid(), "invalid run is being detected as valid")

    def test_run_valid_mixed_expecting_single(self):
        # despite the "valid" in the test name, a mixed run must be reported as invalid
        sequencing_run = self._make_seq_run_mixed()
        # change metadata to expect single end run
        sequencing_run.metadata["layoutType"] = "SINGLE_END"

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertFalse(res.is_valid(), "invalid run is being detected as valid")

    def test_run_invalid_name_error(self):
        # Build sequencing run from real data with common user error
        # One of the files has an extra character in the filename in a spot that is within spec to be parsed,
        # but results in mixing single end and paired end files in the same run.
        sample_sheet_path = path.join(
            path_to_module, "fake_ngs_data_name_error", "SampleSheet.csv")
        parser_instance = miseq_parser.Parser()
        sequencing_run = parser_instance.get_sequencing_run(sample_sheet_path)

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertFalse(res.is_valid(), "invalid run is being detected as valid")
        # errors are compared through their string form
        self.assertEqual(
            str(res.error_list),
            str([SequenceFileError(
                'File count for sample `02-2222` does not match the expected file count `2`. Please verify your data.'
            )]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters