Skip to content

Commit

Permalink
Merge pull request #6 from PennChopMicrobiomeProgram/VT_edit
Browse files Browse the repository at this point in the history
Made changes to allow trimming on reverse coordinates
  • Loading branch information
scottdaniel authored Jun 22, 2021
2 parents 8bee15b + 7b54f87 commit dab46e1
Show file tree
Hide file tree
Showing 7 changed files with 1,713 additions and 1,899 deletions.
11 changes: 9 additions & 2 deletions primertrim/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

DEFAULT_BLAST_FIELDS = [
"qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
"qstart", "qend", "sstart", "send", "qlen", "slen", "qseq", "sseq",
"qstart", "qend", "sstart", "send", "qlen", "slen", "qseq", "sseq", "qstrand",
]

BLAST_FIELD_TYPES = {
Expand All @@ -22,6 +22,7 @@
"slen": int,
"qseq": str,
"sseq": str,
"qstrand": str,
}

BLAST_TO_VSEARCH = {
Expand All @@ -39,6 +40,7 @@
"slen": "ts",
"qseq": "qrow",
"sseq": "trow",
"qstrand": "qstrand",
}

def write_fasta(f, seqs):
Expand Down Expand Up @@ -75,13 +77,14 @@ def search(self, seqs, input_fp=None, output_fp=None, **kwargs):
for hit in self.parse(f):
yield hit

def _call(self, query_fp, output_fp, min_id=0.7, threads=None):
def _call(self, query_fp, output_fp, min_id=0.85, threads=None):
id_arg = "{:.3f}".format(min_id)
userfields_arg = "+".join(BLAST_TO_VSEARCH[f] for f in self.fields)
args = [
"vsearch",
"--usearch_global", query_fp,
"--minseqlength", "10",
"--mincols", "10",
"--id", id_arg,
"--wordlength", "4",
"--strand", "both",
Expand All @@ -108,4 +111,8 @@ def parse(self, f):
for field in self.fields:
fcn = BLAST_FIELD_TYPES[field]
res[field] = fcn(res[field])
if res["qstrand"] == "-":
qstart_temp = res["qlen"]-res["qend"]+1
res["qend"] = res["qlen"]-res["qstart"]+1
res["qstart"] = qstart_temp
yield res
7 changes: 6 additions & 1 deletion primertrim/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ def main(argv=None):
help=(
"Number of CPU threads to use during the alignment stage "
"(default: all the threads)"))
alignment_group.add_argument(
"--align_id", type=float, default=0.85,
help=(
"Minimum percent identity to consider a primer match in vsearch alignment."
"(default: %(default)s)"))
args = p.parse_args(argv)

if args.input_fastq is None:
Expand All @@ -87,7 +92,7 @@ def main(argv=None):
else:
temp_alignment_dir = tempfile.TemporaryDirectory()
alignment_dir = temp_alignment_dir.name
am = AlignmentMatcher(queryset, alignment_dir, args.threads)
am = AlignmentMatcher(queryset, alignment_dir, args.align_id, args.threads)
matchers.append(am)

trimmable_reads = TrimmableReads.from_fastq(args.input_fastq)
Expand Down
5 changes: 3 additions & 2 deletions primertrim/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,12 @@ def find_match(self, seq):


class AlignmentMatcher(Matcher):
def __init__(self, queryset, alignment_dir, cores=1):
def __init__(self, queryset, alignment_dir, align_id, cores=1):
self.queryset = queryset
assert(os.path.exists(alignment_dir))
assert(os.path.isdir(alignment_dir))
self.alignment_dir = alignment_dir
self.align_id = align_id
self.cores = cores

def _make_fp(self, filename):
Expand All @@ -132,7 +133,7 @@ def find_in_seqs(self, seqs):
a = VsearchAligner(subject_fp)
hits = a.search(
seqs.items(), query_fp, result_fp,
min_id=0.75, threads=self.cores)
min_id=self.align_id, threads=self.cores)

for hit in hits:
seq_id = hit["qseqid"]
Expand Down
2,376 changes: 1,078 additions & 1,298 deletions tests/data/trimmed_example.fastq

Large diffs are not rendered by default.

1,188 changes: 594 additions & 594 deletions tests/data/trimmed_example.log

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion tests/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ def test_main_script(tmp_path):
"--log", log_fp,
"--mismatches", "0",
"--min-partial", "100",
"--alignment",
"--align_id", "0.7"
]
main(args)

expected_log_fp = str(DATA_DIR / "trimmed_example.log")
assert read_from(log_fp) == read_from(expected_log_fp)

expected_output_fp = str(DATA_DIR / "trimmed_example.fastq")
assert read_from(output_fp) == read_from(expected_output_fp)
assert read_from(output_fp) == read_from(expected_output_fp)
21 changes: 20 additions & 1 deletion tests/test_matcher.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from primertrim.matcher import (
PrimerMatch,
CompleteMatcher, PartialMatcher,
CompleteMatcher, PartialMatcher, AlignmentMatcher,
)

import os

def test_complete_match():
m = CompleteMatcher(["TTTTTT"], 1, False)
Expand All @@ -17,3 +18,21 @@ def test_partial_match():
assert m.find_match("AAAAGTCGT") == PrimerMatch("Partial", 0, 0, "AAAA")
assert m.find_match("GTCGAAAAA") == PrimerMatch("Partial", 4, 0, "AAAAA")
assert m.find_match("AAAGTCGGCT") == None # Length is 3, no match

def test_align_match(tmp_path):
align_fp = str(tmp_path / "vsearch_align_fp")
os.mkdir(align_fp)
test_dict = {"test": "GGGGGAAAAA",
"test2": "GGGGGAACAA",
"test3": "GGGGGGGAAAAA",
"test4": "GGGGGGGCAAAACCCCCTTTTT"}
p_match = [PrimerMatch("Alignment", 0, 0, "GGGGGAAAAA"),
PrimerMatch("Alignment", 0, 1, "GGGGGAACAA"),
PrimerMatch("Alignment", 2, 0, "GGGGGAAAAA"),
PrimerMatch("Alignment", 2, 1, "GGGGGCAAAACCCCCTTTTT")]
m = AlignmentMatcher(["GGGGGAAAAACCCCCTTTTT"], align_fp, 0.8)
m_gen = m.find_in_seqs(test_dict.items())
i = 0
for item in m_gen:
assert item[1] == p_match[i]
i += 1

0 comments on commit dab46e1

Please sign in to comment.