Merge pull request #50 from Darcy220606/dev

merge v 0.1.6 into main
Darcy220606 · Nov 2, 2022 · 8953acc · 8953acc
2 parents 45f073f + e6de68a
commit 8953acc
Show file tree

Hide file tree

Showing 13 changed files with 134 additions and 131 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -9,4 +9,8 @@ v<0.1.4>, <18.10.2022>
 v<0.1.5>, <27.10.2022> -- Initial release.
     -- adapt reading of hmmer_hmmsearch output to deal with varying header lines
     -- fix syntax in "if" statements in "check_input.py"
-    -- include "check_faa_path" function, to find .faa files also in subdirectories
+    -- include "check_faa_path" function, to find .faa files also in subdirectories
+v<0.1.6>, <02.11.2022>
+    -- Included the HTML output for the complete summary
+    -- add option --threads for diamond (make database and alignment)
+    -- included check if database was downloaded once to not download again
diff --git a/README.md b/README.md
@@ -64,6 +64,7 @@ ampcombi \
 
 Here the head folder containing output files has to be given. AMPcombi finds and summarizes the output files from different tools, if the folder is structured  and named as: `/result_folder/toolsubdir/samplesubdir/sample.tool.filetype`. 
  - Note that the filetype ending might vary and can be specified with `--tooldict`, if it is different from the default. When passing a dictionary via command line, this has to be done as a string with single quotes `' '` and the dictionary keys and items with double quotes `" "`. i.e. `'{"key1":"item1", "key2":"item2"}'`
+- Note that `--sample_list` can also be given if only specific samples are needed from the directory.
 
 The path to the folder containing the respective protein fasta files has to be provided with `--faa_folder`. The files have to be named with `<samplename>.faa`.
 
@@ -112,8 +113,9 @@ The path to the folder containing the respective protein fasta files has to be p
 | --faa_folder  | path to the folder containing the samples` .faa files, Filenames have to contain the corresponding sample-name, i.e. sample_1.faa | ./test_faa/ | ./faa_files/|
 | --tooldict | dictionary of AMP-tools and their respective output file endings | '{"ampir":"ampir.tsv", "amplify":"amplify.tsv", "macrel":"macrel.tsv", "hmmer_hmmsearch":"hmmsearch.txt", "ensembleamppred":"ensembleamppred.txt"}' | - |
 | --amp_database | path to the folder containing the reference database files: (1) a fasta file with <.fasta> file extension and (2) the corresponding table with with functional and taxonomic classifications in <.tsv> file extension | [DRAMP 'general amps'](http://dramp.cpu-bioinfor.org/downloads/) database | ./amp_ref_database/ |
-| --complete_summary | Concatenates all samples' summarized tables into one | False | True |
+| --complete_summary | concatenates all samples' summarized tables into one and generates both 'csv' and interactive 'html' files | False | True |
 | --log  | print messages into log file instead of stdout | False | True |
+| --threads  | adjust the number of threads used for DIAMOND alignment depending on the computing resources available  | 4 | 32 |
 | --version  | print the version number into stdout | - | 0.1.4 |
 
  - Note: The fasta file corresponding to the AMP database should not contain any characters other than ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
@@ -136,6 +138,7 @@ The output will be written into your working directory, containing the following
 |   ├── sample_2_ampcombi.csv
 |   └── sample_2_diamond_matches.txt
 ├── AMPcombi_summary.csv
+├── AMPcombi_summary.html
 └── ampcombi.log
 ```
 

diff --git a/ampcombi/HTML.R b/ampcombi/HTML.R
@@ -0,0 +1,72 @@
+#!/usr/bin/env Rscript
+
+##############################
+# Rscript to visualise the complete summary tables generated by AMPcombi ####
+##############################
+# Date ####
+# October, 19 2022
+##############################
+# Authors ####
+# Anan Ibrahim - ananhamido@hotmail.com - @darcy220606
+# Louisa Perelo - louperelo@gmail.com - @louperelo
+##############################
+# Working_directory ####
+# Input and output paths are resolved relative to the directory the script is
+# launched from, so the working directory is deliberately left untouched.
+##############################
+# Libraries used + arguments ####
+# Install whatever is missing before loading. A CRAN mirror is set explicitly
+# when none is configured, because install.packages() cannot prompt for one in
+# a non-interactive Rscript session.
+required_packages <- c("dplyr", "DT", "optparse", "htmlwidgets", "readr")
+repos <- getOption("repos")
+cran <- repos["CRAN"]
+if (length(cran) == 0 || is.na(cran) || cran == "@CRAN@") {
+  repos["CRAN"] <- "https://cloud.r-project.org"
+}
+for (pkg in required_packages) {
+  if (!requireNamespace(pkg, quietly = TRUE)) {
+    install.packages(pkg, repos = repos)
+  }
+}
+
+library("dplyr")
+library("DT")
+library("optparse")
+library("htmlwidgets")
+
+option_list <- list(
+  make_option(c("-f", "--file"), type = "character", default = "AMPcombi_summary.csv",
+              help = "AMPcombi complete summary table [default= %default]", metavar = "character"),
+  make_option(c("-o", "--out"), type = "character", default = "AMPcombi_summary.html",
+              help = "Provide the name of the output file [default= %default]", metavar = "character")
+)
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Fail early with a readable message instead of a parser error further down.
+if (!file.exists(opt$file)) {
+  stop("Summary table not found: ", opt$file, call. = FALSE)
+}
+
+##############################
+# Generate HTML interactive files ####
+# Read the summary table and drop fully duplicated rows.
+summary_table <-
+  readr::read_csv(opt$file, show_col_types = FALSE) %>%
+  unique()
+
+result <- datatable(summary_table,
+          class = 'cell-border stripe',     ## add column border
+          options = list( paging = TRUE,    ## paginate the output
+                          pageLength = 100, ## number of rows to output for each page
+                          scrollX = TRUE,   ## enable scrolling on X axis
+                          scrollY = TRUE,   ## enable scrolling on Y axis
+                          autoWidth = TRUE, ## use smart column width handling
+                          ## 'server' is not a DataTables option, so it was dropped:
+                          ## a widget written with saveWidget() carries its data
+                          ## in the HTML and is processed in the browser.
+                          dom = 'Bfrtip',
+                          language = list(sSearch = "Keyword look-up:"),
+                          buttons = c('csv', 'excel'), ## download buttons for the table
+                          columnDefs = list(list(targets = '_all', className = 'dt-center'),
+                                            list(targets = 'aa_sequence', visible = TRUE, width = '20'))),
+          extensions = 'Buttons',
+          selection = 'multiple',         ## allow selecting several rows at once
+          filter = 'top',                 ## show the column filters above the table
+          rownames = FALSE                ## don't show row numbers/names
+          )
+
+# Change the HTML size to fill the browser
+result$sizingPolicy$defaultWidth <- "100%"
+
+# Write a standalone HTML file. The previous approach saved with
+# selfcontained = FALSE and then deleted the "<name>_files" folder, which
+# removes the JavaScript/CSS the page needs to render. With
+# selfcontained = TRUE everything is embedded in the HTML and saveWidget()
+# removes the temporary support folder itself.
+out_file <- file.path(normalizePath(dirname(opt$out), mustWork = TRUE),
+                      basename(opt$out))
+saved_standalone <- tryCatch({
+  htmlwidgets::saveWidget(result, out_file, selfcontained = TRUE)
+  TRUE
+}, error = function(e) {
+  message("Could not write a self-contained HTML file (", conditionMessage(e),
+          "); falling back to HTML plus a support folder.")
+  FALSE
+})
+
+# Fallback (e.g. pandoc unavailable on older htmlwidgets): keep the support
+# folder next to the HTML, because the page cannot be displayed without it.
+if (!saved_standalone) {
+  htmlwidgets::saveWidget(result, out_file, selfcontained = FALSE)
+}
+##############################
diff --git a/ampcombi/amp_database.py b/ampcombi/amp_database.py
@@ -44,28 +44,28 @@ def download_DRAMP(db):
 ########################################
 #  FUNCTION: CREATE DIAMOND COMPATIBLE DATBASE FORMATS
 #########################################
-def create_diamond_ref_db(db):
+def create_diamond_ref_db(db,threads):
     cwd = os.getcwd()
     for file in os.listdir(db):
         if file.endswith('.fasta'):
             path = os.path.join(os.path.abspath(db) + '/' + file)
             os.chdir(db)
             #process = subprocess.Popen([f'{scripts_path}/diamond_makedb.sh', path])
-            subprocess.run('diamond_makedb.sh', text=True, input=path)
+            subprocess.run('diamond_makedb.sh', text=True, input=f'{path}\n{threads}')
             os.chdir(cwd)
             print
             return path
 
 ########################################
 #  FUNCTION: DIAMOND ALIGNMENT
 #########################################
-def diamond_alignment(db, amp_faa_paths, amp_matches):
+def diamond_alignment(db, amp_faa_paths, amp_matches,threads):
     #create temp folder and delete at the end
     cwd = os.getcwd()
     for path in amp_faa_paths:
         # align the query with the database
         temp = tempfile.mkdtemp()
-        subprocess.run('diamond_alignment.sh', text=True, input=f'{path}\n{temp}\n{db}')
+        subprocess.run('diamond_alignment.sh', text=True, input=f'{path}\n{temp}\n{db}\n{threads}')
         shutil.move(temp+'/diamond_matches.tsv', amp_matches)
         shutil.rmtree(temp)
         # mege the diamond_alignment with the ref_db table

diff --git a/ampcombi/ampcombi.py b/ampcombi/ampcombi.py
@@ -13,6 +13,7 @@
 from check_input import *
 from amp_database import *
 from print_header import *
+from visualise_complete_summary import *
 
 # Define input arguments:
 parser = argparse.ArgumentParser(prog = 'ampcombi', formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -41,10 +42,12 @@
                     type=str, default='{"ampir":"ampir.tsv", "amplify":"amplify.tsv", "macrel":"macrel.tsv", "neubi":"neubi.fasta", "hmmer_hmmsearch":"hmmsearch.txt", "ensembleamppred":"ensembleamppred.txt"}')
 parser.add_argument("--amp_database", dest="ref_db", nargs='?', help="Enter the path to the folder containing the reference database files (.fa and .tsv); a fasta file and the corresponding table with functional and taxonomic classifications. \n (default: DRAMP database)",
                     type=str, default=None)
-parser.add_argument("--complete_summary", dest="complete", nargs='?', help="Concatenates all sample summaries to one final summary",
+parser.add_argument("--complete_summary", dest="complete", nargs='?', help="Concatenates all sample summaries to one final summary and outputs both csv and interactive html files",
                     type=bool, default=False)
 parser.add_argument("--log", dest="log_file", nargs='?', help="Silences the standard output and captures it in a log file)",
                     type=bool, default=False)
+# Thread count forwarded to DIAMOND. It must be parsed as int: with type=bool every
+# non-empty value (including a string default) is converted to True. const covers a bare "--threads".
+parser.add_argument("--threads", dest="cores", nargs='?', help="Changes the threads used for DIAMOND alignment (default: %(default)s)",
+                    type=int, default=4, const=4)
 parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
 
 # get command line arguments
@@ -59,6 +62,7 @@
 tooldict = json.loads(args.tools)
 database = args.ref_db
 complete_summary = args.complete
+threads = args.cores
 
 # additional variables
 # extract list of tools from input dictionary. If not given, default dict contains all possible tools
@@ -75,10 +79,10 @@
 def main_workflow():
     # print AMPcombi header
     print_header()
-    # check input parameters
-    check_input_complete(path, samplelist_in, filepaths_in, tools)
     # check input sample-list and create sample-list if input empty
     samplelist = check_samplelist(samplelist_in, tools, path)
+    # check input parameters
+    check_input_complete(path, samplelist, filepaths_in, tools)
     # check input filepaths and create list of list of filepaths per sample if input empty
     filepaths = check_pathlist(filepaths_in, samplelist, fileending, path)
     # check amp_ref_database filepaths and create a directory if input empty
@@ -89,7 +93,7 @@ def main_workflow():
 
     # generate summary for each sample
     amp_faa_paths = []
-    create_diamond_ref_db(db)
+    create_diamond_ref_db(db,threads)
     for i in range(0, len(samplelist)):
         main_list = []
         print('\n ########################################################## ')
@@ -107,8 +111,8 @@ def main_workflow():
         amp_faa_paths.append(out_path)
         print(f'The fasta containing AMP sequences for {samplelist[i]} was saved to {samplelist[i]}/ \n')
         amp_matches = samplelist[i] +'/'+samplelist[i]+'_diamond_matches.txt'
-        print(f'The diamond alignment for {samplelist[i]} in process....')
-        diamond_df = diamond_alignment(db, amp_faa_paths, amp_matches)
+        print(f'The diamond alignment for {samplelist[i]} in progress ....')
+        diamond_df = diamond_alignment(db, amp_faa_paths, amp_matches, threads)
         print(f'The diamond alignment for {samplelist[i]} was saved to {samplelist[i]}/.')
         # Merge summary_df and diamond_df
         sample_summary_df = pd.merge(summary_df, diamond_df, on = 'contig_id', how='left')
@@ -121,10 +125,11 @@ def main_workflow():
         # concatenate the sample summary to the complete summary and overwrite it
             complete_summary_df = pd.concat([complete_summary_df, sample_summary_df])
             complete_summary_df.to_csv('AMPcombi_summary.csv', sep=',', index=False)
+            html_generator() 
         else: 
             continue
     if (complete_summary):
-        print(f'\n FINISHED: The AMPcombi_summary.csv file was saved to your current working directory.')
+        print(f'\n FINISHED: The AMPcombi_summary.csv and AMPcombi_summary.html files were saved to your current working directory.')
     else: 
         print(f'\n FINISHED: AMPcombi created summaries for all input samples.')
 

diff --git a/ampcombi/check_input.py b/ampcombi/check_input.py
@@ -39,20 +39,25 @@ def check_faa_path(faa_path, samplename):
     return path_list[0]
 
 def check_ref_database(database):
-    if(database==None):
+    """Return the path of the reference AMP database folder to use.
+
+    If no path was given and 'amp_ref_database' does not exist yet, the folder
+    is created and filled via download_DRAMP(). If no path was given and the
+    folder already exists, it is reused without downloading again. If a path
+    was given it is returned when it exists; otherwise the program exits with
+    an error message.
+    """
+    if((database is None) and (not os.path.exists('amp_ref_database'))):
         print('<--AMP_database> was not given, the current DRAMP general-AMP database will be downloaded and used')
         database = 'amp_ref_database'
         os.makedirs(database, exist_ok=True)
         db = database
         download_DRAMP(db)
-        return db
-    else:
-        if os.path.exists(database):
+        return db    
+    elif (database is not None):
+        if (os.path.exists(database)):
             db = database
+            print(f'<--AMP_database> = {db} is found and will be used')
             return db
-        else:
-            if not os.path.exists(database):
-                sys.exit(f'Reference amp database path {database} does not exist, please check the path.')
+        if (not os.path.exists(database)):
+            sys.exit(f'Reference amp database path {database} does not exist, please check the path.')
+    elif((database is None) and (os.path.exists('amp_ref_database'))):
+        print('<--AMP_database> = DRAMP is already downloaded and will be reused')
+        database = 'amp_ref_database'
+        db = database
+        return db
 
 def check_path(path):
     return os.path.exists(path) #returns True or False

diff --git a/ampcombi/diamond_alignment.sh b/ampcombi/diamond_alignment.sh
@@ -7,18 +7,21 @@
 INPUT_FASTA=$1
 OUTPUT_DIR=$2
 REF_DIR=$3
+THREADS=$4
 
 read INPUT_FASTA
 read OUTPUT_DIR
 read REF_DIR
+read THREADS
 
-# Adjust path according to the input folder with the ist of fasta files
+# Adjust path according to the input folder with the list of fasta files
 IN=$INPUT_FASTA
 OUT=$OUTPUT_DIR 
 REF_DB=$REF_DIR
+P=$THREADS
 
 diamond blastp \
--p 28 -d $REF_DB/amp_ref -q $IN --quiet \
+-p $P -d $REF_DB/amp_ref -q $IN --quiet \
 --outfmt 6 qseqid sseqid pident evalue nident full_qseq full_sseq qseq sseq qcovhsp scovhsp --max-target-seqs 1 --ultra-sensitive -e10000 --id2 1 -s1 -c1 --masking 0 --gapped-filter-evalue 0 --algo 0 --min-score 0 --shape-mask 1111 \
 -o $OUT/diamond_matches.txt
 

diff --git a/ampcombi/diamond_makedb.sh b/ampcombi/diamond_makedb.sh
@@ -5,13 +5,16 @@
 #########################################
 
 INPUT_FASTA=$1
+THREADS=$2
 #OUTPUT_DIR=$2
 
 read INPUT_FASTA
+read THREADS
 #read OUTPUT_DIR
 
 IN=$INPUT_FASTA
+P=$THREADS
 #OUT=$OUTPUT_DIR 
 
 #cd $OUT
-diamond makedb --in $IN -p 28 -d amp_ref --quiet
+diamond makedb --in $IN -p $P -d amp_ref --quiet
diff --git a/ampcombi/print_header.py b/ampcombi/print_header.py
diff --git a/ampcombi/version.py b/ampcombi/version.py
@@ -1 +1 @@
-__version__ = '0.1.5'
+__version__ = '0.1.6'
diff --git a/ampcombi/visualise_complete_summary.py b/ampcombi/visualise_complete_summary.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+# TITLE: Visualise the complete summary and save it to a HTML file
+
+import subprocess
+
+########################################
+#  FUNCTION: GENERATE AN INTERACTIVE HTML SUMMARY 
+#########################################
+def html_generator():
+    """Run the 'HTML.R' script to render the complete summary as an HTML file.
+
+    The script is looked up on PATH and started without arguments, so it uses
+    its own default input and output file names. A non-zero exit status is
+    reported as a warning rather than raised, so a failed rendering does not
+    abort the caller.
+    """
+    result = subprocess.run('HTML.R', text=True)
+    # Surface a failed rendering instead of silently ignoring the exit status.
+    if result.returncode != 0:
+        print(f'WARNING: HTML.R exited with status {result.returncode}; the HTML summary may not have been created.')
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='AMPcombi',
-    version='0.1.5',
+    version='0.1.6',
     author='Anan Ibrahim, Louisa Perelo',
     author_email='ananhamido@hotmail.com, louperelo@gmail.com',
     packages=['ampcombi'],
@@ -17,7 +17,9 @@
              'ampcombi/diamond_makedb.sh',
              'ampcombi/reformat_tables.py',
              'ampcombi/print_header.py',
-             'ampcombi/version.py'],
+             'ampcombi/version.py',
+             'ampcombi/visualise_complete_summary.py',
+             'ampcombi/HTML.R'],
     url='http://pypi.python.org/pypi/AMPcombi/',
     license='LICENSE.txt',
     description='A parsing tool for AMP tools.',