integration of pharokka_multiplotter and pharokka-genpank-parser base…

…d on checkv completeness
replikation · Jul 29, 2024 · ac375fb · ac375fb
1 parent 542292f
commit ac375fb
Show file tree

Hide file tree

Showing 7 changed files with 80 additions and 1,004 deletions.
diff --git a/bin/pharokka_plotter_parser.py b/bin/pharokka_plotter_parser.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import os
+import re
+import argparse
+
+## python decision_plotting_prepare.py --input read_until_FAP76673_e0481cad.csv --output first_steps.csv --chunksize 50
+
+parser = argparse.ArgumentParser(description='read until parser for decision_plotting_prepare')
+parser.add_argument(
+    '--input', required=True, help='choose inputfile')
+parser.add_argument(
+    '--contigs_to_extract', required=True, help='choose inputfile')
+
+
+args = parser.parse_args()
+
+# Define the input file and output extension
+input_file = args.input
+output_extension = '.gbk'
+contigs_file = args.contigs_to_extract
+
+
+def read_contigs_list(contigs_file):
+    """Read the list of contigs to extract from the file."""
+    with open(contigs_file, 'r') as file:
+        contigs = [line.strip() for line in file if line.strip()]
+    print(f"Contigs to extract: {contigs}")
+    return contigs
+
+def split_and_filter_file(input_file, output_extension, delimiter, contigs_to_extract):
+    """Split the input file by delimiter and filter sections based on contigs to extract."""
+    with open(input_file, 'r') as file:
+        content = file.read()
+
+    # Split the content based on the delimiter
+    parts = content.split(delimiter)
+    print(f"Number of parts after splitting: {len(parts)}")
+
+    output_generated = False
+    for part_index, part in enumerate(parts):
+        print(f"\nProcessing part {part_index + 1}/{len(parts)}:")
+        if part.strip() == "":
+            continue  # Skip empty parts
+
+          # Extract the ACCESSION line (e.g., ACCESSION   pos_phage_9)
+        match = re.search(r'^ACCESSION\s+(\S+)', part, re.MULTILINE)
+        if match:
+            accession = match.group(1).strip()
+            print(f"Found ACCESSION: '{accession}'")
+            if accession in contigs_to_extract:
+                output_file = f'{accession}{output_extension}'
+                print(f"Writing to file: {output_file}")
+                with open(output_file, 'w') as file:
+                    file.write(part.strip() + '\n//\n')  # Append // to the end
+                output_generated = True
+            else:
+                print(f"ACCESSION '{accession}' not in the contigs to extract: {contigs_to_extract}")
+        else:
+            print("ACCESSION line not found in part.")
+
+    if not output_generated:
+        print("No matching ACCESSION values found in the input file.")
+
+    print("Processing complete.")
+
+# Read contigs to extract from file
+contigs_to_extract = read_contigs_list(contigs_file)
+
+# Split and filter the input file
+split_and_filter_file(input_file, output_extension, delimiter='//', contigs_to_extract=contigs_to_extract)
diff --git a/bin/pharokka_plotter_parser.sh b/bin/pharokka_plotter_parser.sh
diff --git a/nextflow.config b/nextflow.config
@@ -16,6 +16,7 @@ params {
     setup = ''
     all_tools = false
     annotation_db = false
+    plot_completeness = '75.00'
 
     // folder structure
     output = 'results'

diff --git a/phage.nf b/phage.nf
@@ -349,6 +349,7 @@ def helpMSG() {
     ${c_yellow}Workflow control:${c_reset}
     --identify          only phage identification, skips analysis
     --annotate          only annotation, skips phage identification
+    --plot_completeness  pharokka (annotation) will plot Phage-contigs with CheckV-completeness > 75.00 (or you provide your cutoff value, e.g. 80.00)
 
     ${c_yellow}Databases, file, container behaviour:${c_reset}
     --databases         specifiy download location of databases