Skip to content

Commit

Permalink
Merge pull request #5 from kylebittinger/add-min-length
Browse files Browse the repository at this point in the history
Add min length command-line option
  • Loading branch information
scottdaniel authored May 21, 2021
2 parents a3d26e5 + 7574ae8 commit 8bee15b
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 51,505 deletions.
45 changes: 27 additions & 18 deletions primertrim/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import sys
import tempfile

from .fastq import TrimmableReads
from .trimmable_reads import TrimmableReads
from .matcher import (
CompleteMatcher, PartialMatcher, AlignmentMatcher,
)
from .dna import deambiguate


def main(argv=None):
p = argparse.ArgumentParser()
p.add_argument(
Expand All @@ -21,10 +22,15 @@ def main(argv=None):
help="Input FASTQ file to be trimmed (default: standard input)")
io_group.add_argument(
"-o", "--output-fastq", type=argparse.FileType('w'),
help="Output fastq after trimming (default: standard output)")
help="Output FASTQ file after trimming (default: standard output)")
io_group.add_argument(
"--log", type=argparse.FileType('w'),
help="Log file of primers and location (default: not written)")
io_group.add_argument(
"--min-length", type=int, default=50,
help=(
"Minimum length of reads written to the output FASTQ file. "
"(default: %(default)s)"))

complete_group = p.add_argument_group("Complete, partial matching stages")
complete_group.add_argument(
Expand Down Expand Up @@ -93,19 +99,22 @@ def main(argv=None):
if matchobj is not None:
trimmable_reads.register_match(read_id, matchobj)

for desc, seq, qual in trimmable_reads.get_trimmed_reads():
if seq != "":
args.output_fastq.write("@{0}\n{1}\n+\n{2}\n".format(desc, seq, qual))

if args.log:
args.log.write("read_id\tmatch_type\ttrimmed_length\tmismatches\tobserved_primer\n")
for read_id, matchobj in trimmable_reads.matches.items():
if matchobj is None:
seq = trimmable_reads.seqs[read_id]
args.log.write("{0}\tNo match\t{1}\t\t\n".format(
read_id, len(seq)))
else:
args.log.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
read_id, matchobj.method, matchobj.start,
matchobj.mismatches, matchobj.primerseq,
))
output_reads = trimmable_reads.output_reads(args.min_length)
write_fastq(args.output_fastq, output_reads)

output_loginfo = trimmable_reads.output_loginfo()
write_log(args.log, output_loginfo, trimmable_reads.loginfo_colnames)


def write_fastq(f, reads):
for desc, seq, qual in reads:
f.write("@{0}\n{1}\n+\n{2}\n".format(desc, seq, qual))


def write_log(f, loginfo, colnames=None):
if colnames:
f.write("\t".join(colnames))
f.write("\n")
for vals in loginfo:
f.write("\t".join(str(v) if v is not None else "" for v in vals))
f.write("\n")
20 changes: 18 additions & 2 deletions primertrim/fastq.py → primertrim/trimmable_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def get_unmatched_seqs(self):
def register_match(self, read_id, matchobj):
self.matches[read_id] = matchobj

def get_trimmed_reads(self):
def output_reads(self, min_length=0):
for read_id in self.descs.keys():
matchobj = self.matches[read_id]
desc = self.descs[read_id]
Expand All @@ -34,7 +34,23 @@ def get_trimmed_reads(self):
if matchobj is not None:
seq = seq[:matchobj.start]
qual = qual[:matchobj.start]
yield (desc, seq, qual)
if len(seq) >= min_length:
yield (desc, seq, qual)

def output_loginfo(self):
for read_id, matchobj in self.matches.items():
if matchobj is None:
seq = self.seqs[read_id]
yield (read_id, "No match", len(seq), None, None)
else:
yield (
read_id, matchobj.method, matchobj.start,
matchobj.mismatches, matchobj.primerseq,
)

loginfo_colnames = [
"read_id", "match_type", "trimmed_length", "mismatches",
"observed_primer"]


def parse_fastq(f):
Expand Down
File renamed without changes.
19,700 changes: 0 additions & 19,700 deletions tests/data/no_primer_Sub10003.V1.sputum.redo_1mm_R1.fastq

This file was deleted.

5,055 changes: 0 additions & 5,055 deletions tests/data/no_primer_Sub10003.V1.sputum.redo_1mm_R1.log

This file was deleted.

20,216 changes: 0 additions & 20,216 deletions tests/data/no_primer_Sub10003.V1.sputum.redo_R2.fastq

This file was deleted.

5,054 changes: 0 additions & 5,054 deletions tests/data/no_primer_Sub10003.V1.sputum.redo_R2.log

This file was deleted.

Large diffs are not rendered by default.

File renamed without changes.
32 changes: 4 additions & 28 deletions tests/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,14 @@
DATA_DIR = Path(__file__).parent / "data"


def data_fp(filename):
return str(DATA_DIR / filename)


def read_from(filepath):
with open(filepath) as f:
res = f.readlines()
return res


def test_main_script(tmp_path):
input_fp = data_fp("Sub10003.V1.sputum.redo_R1.fastq")
input_fp = str(DATA_DIR / "example.fastq")
output_fp = str(tmp_path / "out.fastq")
log_fp = str(tmp_path / "out.log")
args = [
Expand All @@ -29,29 +26,8 @@ def test_main_script(tmp_path):
]
main(args)

expected_log_fp = data_fp("no_primer_Sub10003.V1.sputum.redo_R1.log")
assert read_from(log_fp) == read_from(expected_log_fp)

expected_output_fp = data_fp("no_primer_Sub10003.V1.sputum.redo_R1.fastq")
assert read_from(output_fp) == read_from(expected_output_fp)


def test_main_script_1mm(tmp_path):
input_fp = data_fp("Sub10003.V1.sputum.redo_R1.fastq")
output_fp = str(tmp_path / "out.fastq")
log_fp = str(tmp_path / "out.log")
args = [
"GCATCGATGAAGAACGCAGC",
"-i", input_fp,
"-o", output_fp,
"--log", log_fp,
"--mismatches", "1",
"--min-partial", "100",
]
main(args)

expected_log_fp = data_fp("no_primer_Sub10003.V1.sputum.redo_1mm_R1.log")
expected_log_fp = str(DATA_DIR / "trimmed_example.log")
assert read_from(log_fp) == read_from(expected_log_fp)

expected_output_fp = data_fp("no_primer_Sub10003.V1.sputum.redo_1mm_R1.fastq")
expected_output_fp = str(DATA_DIR / "trimmed_example.fastq")
assert read_from(output_fp) == read_from(expected_output_fp)
68 changes: 68 additions & 0 deletions tests/test_trimmable_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from primertrim.trimmable_reads import TrimmableReads
from primertrim.matcher import PrimerMatch


read1 = ("seq1", "ATGTCATGACTTGACTGCGG", "FFFFFFFFFFFFFFFFFFFF")
read2 = ("seq2", "AGTCACGCTGACTGCATTGA", "FFFFFFFFFFFFFFFFFFFF")
read3 = ("seq3", "TACGTCATGCATCGTAGTAA", "FFFFFFFFFFFFFFFFFFFF")

seq1 = ("seq1", "ATGTCATGACTTGACTGCGG")
seq2 = ("seq2", "AGTCACGCTGACTGCATTGA")
seq3 = ("seq3", "TACGTCATGCATCGTAGTAA")

log1 = ("seq1", "No match", 20, None, None)
log2 = ("seq2", "No match", 20, None, None)
log3 = ("seq3", "No match", 20, None, None)

read2_trim10 = ("seq2", "AGTCACGCTG", "FFFFFFFFFF")
log2_trim10 = ("seq2", "Complete", 10, 0, "ACTGCATTGA")
read2_trim0 = ("seq2", "", "")


def test_register_match():
t = TrimmableReads([read1, read2, read3])

assert list(t.get_unmatched_seqs()) == [seq1, seq2, seq3]

m = PrimerMatch("Complete", 10, 0, "ACTGCATTGA")
t.register_match("seq2", m)

assert list(t.get_unmatched_seqs()) == [seq1, seq3]


def test_output():
t = TrimmableReads([read1, read2, read3])

assert list(t.output_reads()) == [read1, read2, read3]

m = PrimerMatch("Complete", 10, 0, "ACTGCATTGA")
t.register_match("seq2", m)

assert list(t.output_reads()) == [read1, read2_trim10, read3]
assert list(t.output_loginfo()) == [log1, log2_trim10, log3]


def test_output_min_length():
t = TrimmableReads([read1, read2, read3])
m = PrimerMatch("Complete", 10, 0, "ACTGCATTGA")
t.register_match("seq2", m)

# All reads written out
assert list(t.output_reads(min_length=0)) == [read1, read2_trim10, read3]
# Removes read2
assert list(t.output_reads(min_length=15)) == [read1, read3]
# No reads are long enough
assert list(t.output_reads(min_length=30)) == []


def test_output_zero_length():
t = TrimmableReads([read1, read2, read3])
m = PrimerMatch("Complete", 0, 0, "ACTGCATTGA")
t.register_match("seq2", m)

# All reads written out
assert list(t.output_reads(min_length=0)) == [read1, read2_trim0, read3]
# Negative min_length writes out all reads
assert list(t.output_reads(min_length=-5)) == [read1, read2_trim0, read3]
# Removes read2
assert list(t.output_reads(min_length=1)) == [read1, read3]

0 comments on commit 8bee15b

Please sign in to comment.