Merge pull request #6 from PennChopMicrobiomeProgram/VT_edit

Made changes to allow trimming on reverse coordinates
PennChopMicrobiomeProgram · Jun 22, 2021 · dab46e1 · dab46e1
2 parents 8bee15b + 7b54f87
commit dab46e1
Show file tree

Hide file tree

Showing 7 changed files with 1,713 additions and 1,899 deletions.
diff --git a/primertrim/align.py b/primertrim/align.py
@@ -4,7 +4,7 @@
 
 DEFAULT_BLAST_FIELDS = [
     "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
-    "qstart", "qend", "sstart", "send", "qlen", "slen", "qseq", "sseq",
+    "qstart", "qend", "sstart", "send", "qlen", "slen", "qseq", "sseq", "qstrand",
 ]
 
 BLAST_FIELD_TYPES = {
@@ -22,6 +22,7 @@
     "slen": int,
     "qseq": str,
     "sseq": str,
+    "qstrand": str,
 }
 
 BLAST_TO_VSEARCH = {
@@ -39,6 +40,7 @@
     "slen": "ts",
     "qseq": "qrow",
     "sseq": "trow",
+    "qstrand": "qstrand",
 }
 
 def write_fasta(f, seqs):
@@ -75,13 +77,14 @@ def search(self, seqs, input_fp=None, output_fp=None, **kwargs):
             for hit in self.parse(f):
                 yield hit
 
-    def _call(self, query_fp, output_fp, min_id=0.7, threads=None):
+    def _call(self, query_fp, output_fp, min_id=0.85, threads=None):
         id_arg = "{:.3f}".format(min_id)
         userfields_arg = "+".join(BLAST_TO_VSEARCH[f] for f in self.fields)
         args = [
             "vsearch",
             "--usearch_global", query_fp,
             "--minseqlength", "10",
+            "--mincols", "10",
             "--id", id_arg,
             "--wordlength", "4",
             "--strand", "both",
@@ -108,4 +111,8 @@ def parse(self, f):
                 for field in self.fields:
                     fcn = BLAST_FIELD_TYPES[field]
                     res[field] = fcn(res[field])
+            if res["qstrand"] == "-":
+                qstart_temp = res["qlen"]-res["qend"]+1
+                res["qend"] = res["qlen"]-res["qstart"]+1
+                res["qstart"] = qstart_temp
             yield res
diff --git a/primertrim/command.py b/primertrim/command.py
@@ -61,6 +61,11 @@ def main(argv=None):
         help=(
             "Number of CPU threads to use during the alignment stage "
             "(default: all the threads)"))
+    alignment_group.add_argument(
+        "--align_id", type=float, default=0.85,
+        help=(
+            "Minimum percent identity to consider a primer match in vsearch alignment."
+            "(default: %(default)s)"))
     args = p.parse_args(argv)
 
     if args.input_fastq is None:
@@ -87,7 +92,7 @@ def main(argv=None):
         else:
             temp_alignment_dir = tempfile.TemporaryDirectory()
             alignment_dir = temp_alignment_dir.name
-        am = AlignmentMatcher(queryset, alignment_dir, args.threads)
+        am = AlignmentMatcher(queryset, alignment_dir, args.align_id, args.threads)
         matchers.append(am)
 
     trimmable_reads = TrimmableReads.from_fastq(args.input_fastq)

diff --git a/primertrim/matcher.py b/primertrim/matcher.py
@@ -107,11 +107,12 @@ def find_match(self, seq):
 
 
 class AlignmentMatcher(Matcher):
-    def __init__(self, queryset, alignment_dir, cores=1):
+    def __init__(self, queryset, alignment_dir, align_id, cores=1):
         self.queryset = queryset
         assert(os.path.exists(alignment_dir))
         assert(os.path.isdir(alignment_dir))
         self.alignment_dir = alignment_dir
+        self.align_id = align_id
         self.cores = cores
 
     def _make_fp(self, filename):
@@ -132,7 +133,7 @@ def find_in_seqs(self, seqs):
         a = VsearchAligner(subject_fp)
         hits = a.search(
             seqs.items(), query_fp, result_fp,
-            min_id=0.75, threads=self.cores)
+            min_id=self.align_id, threads=self.cores)
 
         for hit in hits:
             seq_id = hit["qseqid"]

diff --git a/tests/data/trimmed_example.fastq b/tests/data/trimmed_example.fastq
diff --git a/tests/data/trimmed_example.log b/tests/data/trimmed_example.log
diff --git a/tests/test_command.py b/tests/test_command.py
@@ -23,11 +23,13 @@ def test_main_script(tmp_path):
         "--log", log_fp,
         "--mismatches", "0",
         "--min-partial", "100",
+        "--alignment",
+        "--align_id", "0.7"
     ]
     main(args)
 
     expected_log_fp = str(DATA_DIR / "trimmed_example.log")
     assert read_from(log_fp) == read_from(expected_log_fp)
 
     expected_output_fp = str(DATA_DIR / "trimmed_example.fastq")
-    assert read_from(output_fp) == read_from(expected_output_fp)
+    assert read_from(output_fp) == read_from(expected_output_fp)
diff --git a/tests/test_matcher.py b/tests/test_matcher.py
@@ -1,8 +1,9 @@
 from primertrim.matcher import (
     PrimerMatch,
-    CompleteMatcher, PartialMatcher,
+    CompleteMatcher, PartialMatcher, AlignmentMatcher,
 )
 
+import os
 
 def test_complete_match():
     m = CompleteMatcher(["TTTTTT"], 1, False)
@@ -17,3 +18,21 @@ def test_partial_match():
     assert m.find_match("AAAAGTCGT") == PrimerMatch("Partial", 0, 0, "AAAA")
     assert m.find_match("GTCGAAAAA") == PrimerMatch("Partial", 4, 0, "AAAAA")
     assert m.find_match("AAAGTCGGCT") == None # Length is 3, no match
+
+def test_align_match(tmp_path):
+    align_fp = str(tmp_path / "vsearch_align_fp")
+    os.mkdir(align_fp)
+    test_dict = {"test":  "GGGGGAAAAA",
+                 "test2": "GGGGGAACAA",
+                 "test3": "GGGGGGGAAAAA",
+                 "test4": "GGGGGGGCAAAACCCCCTTTTT"}
+    p_match = [PrimerMatch("Alignment", 0, 0, "GGGGGAAAAA"),
+               PrimerMatch("Alignment", 0, 1, "GGGGGAACAA"),
+               PrimerMatch("Alignment", 2, 0, "GGGGGAAAAA"),
+               PrimerMatch("Alignment", 2, 1, "GGGGGCAAAACCCCCTTTTT")]
+    m = AlignmentMatcher(["GGGGGAAAAACCCCCTTTTT"], align_fp, 0.8)
+    m_gen = m.find_in_seqs(test_dict.items())
+    i = 0
+    for item in m_gen:
+        assert item[1] == p_match[i]
+        i += 1