Skip to content

Commit

Permalink
Merge pull request #139 from phac-nml/file_count_validation
Browse files Browse the repository at this point in the history
File count validation
  • Loading branch information
ericenns authored Jun 29, 2022
2 parents dd19c68 + ab4619a commit 0503661
Show file tree
Hide file tree
Showing 17 changed files with 300 additions and 7 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ Changes
=======

Beta 0.8.2
---------
----------
Bug Fixes:
* config file now evaluates any capitalization of True/False and displays errors to the user if unable to parse.
* Fixed command line help text inconsistency
* Catch mixed paired end and single end files in a sequencing run at the validation step and show user which samples are incorrect.

Beta 0.8.1
---------
Expand Down
2 changes: 2 additions & 0 deletions docs/developers/objects.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ It contains a `project_list` which relate to the IRIDA projects that samples wil

The `metadata` dict is mostly unused, but must include `layoutType` as either `PAIRED_END` or `SINGLE_END`; this determines whether the samples within the sequencing run are paired end or single end reads.

There is also a helper method `is_paired_end` that reads `layoutType` from the `metadata` dict and returns `True` when it is `PAIRED_END` and `False` otherwise.

### Project `model/project.py`

The `Project` object relates to a project on IRIDA.
Expand Down
2 changes: 1 addition & 1 deletion iridauploader/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION_NUMBER = "0.8.1"
VERSION_NUMBER = "0.8.2"
8 changes: 7 additions & 1 deletion iridauploader/core/parsing_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import iridauploader.config as config
import iridauploader.parsers as parsers
from iridauploader.core import model_validator, file_size_validator
from iridauploader.core import model_validator, file_size_validator, uniform_file_count_validator


def get_parser_from_config():
Expand Down Expand Up @@ -46,6 +46,12 @@ def parse_and_validate(directory):
logging.info("parsing_handler:Exception while validating Sequencing Run")
raise parsers.exceptions.ValidationError("Sequencing Run is not valid", validation_result)

logging.info("Validating file counts match sequencing run")
validation_result = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)
if not validation_result.is_valid():
logging.info("parsing_handler:Exception while validating Sequencing Run")
raise parsers.exceptions.ValidationError("Sequencing Run is not valid", validation_result)

logging.info("Validating files contain data")
validation_result = file_size_validator.validate_file_size_minimum(sequencing_run)
if not validation_result.is_valid():
Expand Down
36 changes: 36 additions & 0 deletions iridauploader/core/uniform_file_count_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import iridauploader.model as model
from iridauploader.parsers.exceptions.sequence_file_error import SequenceFileError


def validate_uniform_file_count(sequencing_run):
    """
    Check that every sample's files agree with the layout of the run (all pairs, or all single files)
    :param sequencing_run: SequencingRun object to validate
    :return: ValidationResult object, holding one SequenceFileError per mismatched sample
    """
    run_is_paired = sequencing_run.is_paired_end()
    if run_is_paired:
        expected_file_count = 2
    else:
        expected_file_count = 1

    result = model.ValidationResult()

    # flatten projects -> samples so each sample in the run is checked in turn
    all_samples = (
        sample
        for project in sequencing_run.project_list
        for sample in project.sample_list
    )
    for sample in all_samples:
        if _matching_find_count(sample, run_is_paired):
            continue
        # only the mismatched samples are reported, by name
        message = (
            "File count for sample `{}` does not match the expected file count `{}`. "
            "Please verify your data."
        ).format(sample.sample_name, expected_file_count)
        result.add_error(SequenceFileError(message))

    return result


def _matching_find_count(sample, paired):
    """
    Compare the paired end / single end state of a sample's files to the expected state
    :param sample: Sample object, its `sequence_file` is asked whether it is paired end
    :param paired: boolean, True when paired end files are expected
    :return: boolean, True when the sample agrees with `paired`
    """
    sample_is_paired = sample.sequence_file.is_paired_end()
    return sample_is_paired == paired
8 changes: 8 additions & 0 deletions iridauploader/model/sequencing_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,11 @@ def upload_route_string(self, sequencing_run_type):

def get_dict(self):
    """
    Expose the attributes of this object as a dictionary
    :return: the instance attribute dict itself (not a copy)
    """
    return vars(self)

def is_paired_end(self):
    """
    Checks the metadata field to see if run is paired end or single end
    :return: boolean, True when `layoutType` is "PAIRED_END", False for any other value
    :raises KeyError: when the metadata dict has no `layoutType` entry
    """
    # the comparison already yields a bool, no conditional expression needed
    return self.metadata['layoutType'] == "PAIRED_END"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
24 changes: 24 additions & 0 deletions iridauploader/tests/core/fake_ngs_data_name_error/SampleSheet.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[Header]
IEMFileVersion,4
Investigator Name,Denji
Experiment Name,1
Date,12/3/2018
Workflow,GenerateFASTQ
Application,FASTQ Only
Assay,Nextera XT
Description,Chainsaw
Chemistry,Moter Oil

[Reads]
251
250

[Settings]
ReverseComplement,0
Adapter,AAAAGGGGAAAAGGGGAAA

[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
01-1111,01-1111,1,01,N01,AAAAAAAA,S01,TTTTTTTT,6,Super bug
02-2222,02-2222,2,02,N02,GGGGGGGG,S02,CCCCCCCC,6,Scary bug
03-3333,03-3333,3,03,N03,CCCCCCCC,S03,GGGGGGGG,6,Deadly bug
11 changes: 9 additions & 2 deletions iridauploader/tests/core/test_parsing_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,16 @@ class TestParseAndValidate(unittest.TestCase):
def setUp(self):
    """
    Print a banner naming the module and test method that is about to run
    """
    banner = "".join(("\nStarting ", self.__module__, ": ", self._testMethodName))
    print(banner)

@patch("iridauploader.core.uniform_file_count_validator.validate_uniform_file_count")
@patch("iridauploader.core.file_size_validator.validate_file_size_minimum")
@patch("iridauploader.core.parsing_handler.model_validator.validate_sequencing_run")
@patch("iridauploader.core.parsing_handler.get_parser_from_config")
def test_all_functions_called(self, mock_get_parser, mock_validate_model, mock_validate_file_size):
def test_all_functions_called(
self,
mock_get_parser,
mock_validate_model,
mock_validate_file_size,
mock_validate_uniform_file_count):
"""
Makes sure that all relevant functions are called so that it will parse and validate fully
:return:
Expand All @@ -61,10 +67,11 @@ def test_all_functions_called(self, mock_get_parser, mock_validate_model, mock_v
mock_get_parser.side_effect = [mock_parser_instance]

mock_validation_result = unittest.mock.MagicMock()
mock_validation_result.is_valid.side_effect = [True, True]
mock_validation_result.is_valid.side_effect = [True, True, True]

mock_validate_model.side_effect = [mock_validation_result]
mock_validate_file_size.side_effect = [mock_validation_result]
mock_validate_uniform_file_count.side_effect = [mock_validation_result]

res = parsing_handler.parse_and_validate("mock_directory")

Expand Down
209 changes: 209 additions & 0 deletions iridauploader/tests/core/test_uniform_file_count_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import unittest
from os import path


from iridauploader.core import uniform_file_count_validator
from iridauploader.parsers.miseq import parser as miseq_parser
from iridauploader.parsers.exceptions import SequenceFileError
from iridauploader import model

# Absolute directory of this module, falling back to '.' should that come out empty
path_to_module = path.abspath(path.dirname(__file__)) or '.'


class TestValidateFileSizeMinimum(unittest.TestCase):
    """
    Testing the validate_uniform_file_count function
    """

    def setUp(self):
        print("\nStarting " + self.__module__ + ": " + self._testMethodName)

    @staticmethod
    def _build_seq_run(file_names_per_sample, layout_type):
        """
        Shared builder for the fixture runs below: one project, one sample per entry given
        :param file_names_per_sample: list of lists, each inner list holds the file names for one sample,
            joined onto the fake_ngs_data BaseCalls directory next to this module
        :param layout_type: value stored as `layoutType` in the run metadata
        :return: SequencingRun object
        """
        base_calls_dir = path.join(path_to_module, "fake_ngs_data", "Data", "Intensities", "BaseCalls")
        sample_list = []
        for file_names in file_names_per_sample:
            sample = model.Sample("test_sample", "description", 1)
            sample.sequence_file = model.SequenceFile(
                [path.join(base_calls_dir, file_name) for file_name in file_names])
            sample_list.append(sample)
        project = model.Project("test_project", sample_list, "description")
        return model.SequencingRun({"layoutType": layout_type}, [project], "miseq")

    @staticmethod
    def _make_seq_run_paired():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset is a paired end run
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run(
            [
                ["01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"],
                ["02-2222_S1_L001_R1_001.fastq.gz", "02-2222_S1_L001_R2_001.fastq.gz"],
                ["03-3333_S1_L001_R1_001.fastq.gz", "03-3333_S1_L001_R2_001.fastq.gz"],
            ],
            "PAIRED_END")

    @staticmethod
    def _make_seq_run_single():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset is a single end run
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run(
            [
                ["01-1111_S1_L001_R1_001.fastq.gz"],
                ["02-2222_S1_L001_R1_001.fastq.gz"],
                ["03-3333_S1_L001_R1_001.fastq.gz"],
            ],
            "SINGLE_END")

    @staticmethod
    def _make_seq_run_mixed():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset mixes single end and paired end runs, but identifies as paired end
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run(
            [
                ["01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"],
                ["02-2222_S1_L001_R1_001.fastq.gz"],
                ["03-3333_S1_L001_R1_001.fastq.gz"],
            ],
            "PAIRED_END")

    @staticmethod
    def _make_seq_run_paired_with_incorrect_file_name():
        """
        Make a sequencing run pointed at real data for the tests
        This dataset is a paired end run with a common user error of a misnamed file
        :return: SequencingRun object
        """
        return TestValidateFileSizeMinimum._build_seq_run(
            [
                ["01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"],
                # second file carries an extra `x` after the sample id
                ["02-2222_S1_L001_R1_001.fastq.gz", "02-2222x_S1_L001_R2_001.fastq.gz"],
                ["03-3333_S1_L001_R1_001.fastq.gz", "03-3333_S1_L001_R2_001.fastq.gz"],
            ],
            "PAIRED_END")

    def test_run_valid_paired(self):
        """
        A run where every sample has two files and the metadata says paired end is valid
        """
        sequencing_run = self._make_seq_run_paired()

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertTrue(res.is_valid(), "valid run is being detected as invalid")

    def test_run_valid_single(self):
        """
        A run where every sample has one file and the metadata says single end is valid
        """
        sequencing_run = self._make_seq_run_single()

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertTrue(res.is_valid(), "valid run is being detected as invalid")

    def test_run_valid_mixed_expecting_paired(self):
        """
        A mixed run whose metadata says paired end must be reported as invalid
        """
        sequencing_run = self._make_seq_run_mixed()

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertFalse(res.is_valid(), "invalid run is being detected as valid")

    def test_run_valid_mixed_expecting_single(self):
        """
        A mixed run whose metadata says single end must be reported as invalid
        """
        sequencing_run = self._make_seq_run_mixed()
        # change metadata to expect single end run
        sequencing_run.metadata["layoutType"] = "SINGLE_END"

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertFalse(res.is_valid(), "invalid run is being detected as valid")

    def test_run_invalid_name_error(self):
        """
        A run parsed from disk with one misnamed file must be invalid and name the affected sample
        """
        # Build sequencing run from real data with common user error
        # One of the files has an extra character in the filename in a spot that is within spec to be parsed,
        # but results in mixing single end and paired end files in the same run.
        sample_sheet_path = path.join(
            path_to_module, "fake_ngs_data_name_error", "SampleSheet.csv")
        parser_instance = miseq_parser.Parser()
        sequencing_run = parser_instance.get_sequencing_run(sample_sheet_path)

        # Run code to test
        res = uniform_file_count_validator.validate_uniform_file_count(sequencing_run)

        # validate result
        self.assertFalse(res.is_valid(), "invalid run is being detected as valid")
        # the error lists are compared through their string form
        self.assertEqual(
            str(res.error_list),
            str([SequenceFileError(
                'File count for sample `02-2222` does not match the expected file count `2`. Please verify your data.'
            )]))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setuptools.setup(
name='iridauploader',
version='0.8.1',
version='0.8.2',
description='IRIDA uploader: upload NGS data to IRIDA system',
url='https://github.com/phac-nml/irida-uploader',
author='Jeffrey Thiessen',
Expand Down
2 changes: 1 addition & 1 deletion windows-installer.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[Application]
name=IRIDA Uploader GUI
version=0.8.1
version=0.8.2
entry_point=iridauploader.gui.gui:main
icon=iridauploader/gui/images/icon.ico
# Uncomment this to have a console show alongside the application
Expand Down

0 comments on commit 0503661

Please sign in to comment.