# workflows/metagenomics-wf/config.yaml

###################################################################################################
## Config file for the "bit" assembly-based metagenomics processing workflow                     ##
## bit: https://github.com/AstrobioMike/bit                                                      ##
##                                                                                               ##
## If you use this workflow in a publication, please consider citing :)                          ##
##   Lee M. bit: a multipurpose collection of bioinformatics tools. F1000Research 2022, 11:122.  ##
##   https://doi.org/10.12688/f1000research.79530.1                                              ##
###################################################################################################


############################################################
##################### VARIABLES TO SET #####################
############################################################

############################################################################
##### This first set of variables needs to match what is on our system #####
############################################################################

## file listing the unique portion of each sample name (single column)
sample_info_file: "unique-sample-IDs.txt"

## directory holding the input reads (either a full path, or a path relative to the workflow directory)
input_reads_dir: "test-metagenomics-reads/"

## set to "TRUE" only if the data are single-end (one read-file per sample); any other value is considered paired-end
single_end_data: ""

## suffixes of the input read files (whatever follows the unique part of the sample names)
    # e.g. a file named "Sample-1_R1.fastq.gz" has the suffix "_R1.fastq.gz"
input_R1_suffix: "_R1.fastq.gz"
input_R2_suffix: "_R2.fastq.gz"

    # for single-end data, set this one instead (the two above then don't matter)
input_suffix: ".fastq.gz"

## root directory of the reference databases (they will be downloaded here if they don't exist yet)
    # give this as a full path (starting with `/`) that includes the ending `/`, as in the
    # example below (note that the `~/` home shortcut is not expanded
    # by snakemake's evaluation of files, so don't use that)
REF_DB_ROOT_DIR: "/opt/shared/mg-refs/"

######################################################################
##### The rest only need to be altered if we want to change them #####
######################################################################

## "TRUE" to perform binning and MAG recovery; anything else means the workflow won't do any binning, MAG recovery, or characterization of bins/MAGs
perform_binning_and_MAG_recovery: "TRUE"

## threads used PER snakemake job (the number of jobs is set with the -j parameter passed to the snakemake call)
    # passed to megahit, bowtie2, samtools, metabat2, checkm-pplacer (many may be running concurrently)
num_threads: 8

## CPUs used PER snakemake job
    # passed to KOFamScan, CAT, checkm (many may be running concurrently)
num_cpus: 8

## cpus that gtdb-tk passes to pplacer; pplacer can have issues with memory when given multiple cpus, see e.g. https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes
gtdb_tk_pplacer_cpus: 1

## CPUs for gtdb-tk itself (only 1 gtdb-tk job will be run, so this is not multiplied)
gtdb_tk_num_cpus: 8

## scratch directory for gtdb-tk, for using disk space instead of RAM (it can be memory intensive); see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes
    # leave empty to use memory (the default); to use disk space, give the quoted path to a directory that already exists
gtdb_tk_scratch_location: ""

## maximum memory the megahit assembler is allowed to use
    # can be set either as a proportion of what is available on the system, e.g. 0.5
    # or as an absolute value in bytes, e.g. "100e9" would be 100 GB
    # the exponent form is kept in quotes on purpose: an unquoted 100e9 is type-ambiguous
    # (YAML 1.1 loaders such as PyYAML read it as the text "100e9", YAML 1.2 loaders read it
    # as a float), and quoting makes every loader hand over the same text
max_mem_megahit: "100e9"

## block size for CAT/diamond; a lower value means less RAM usage, see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options
block_size: 4

## checkm's reduced_tree option, which limits the RAM usage to 16GB; https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands#tree
    # "TRUE" turns it on; anything else will be considered "FALSE" and the default full tree will be used
reduced_tree: ""

## cutoffs (in percent) for filtering MAGs on their checkm quality assessments; see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics
minimum_estimated_completion: 90
maximum_estimated_redundancy: 10

## suffixes for the quality trimmed/filtered reads
filtered_R1_suffix: "_R1_filtered.fastq.gz"
filtered_R2_suffix: "_R2_filtered.fastq.gz"

# used when the data are single-end
filtered_suffix: "_filtered.fastq.gz"

## main output directory of the workflow (relative to the processing directory; created if needed; should include the trailing "/")
outputs_dir: "workflow-outputs/"

## extra prefix for output files that describe more than one sample (to make them unique compared to other datasets)
# leave empty, i.e. "", if not wanted; if adding one, include its separator at the end, e.g. "Swift1S_"
additional_filename_prefix: ""

## trimming setting recommended when working with Swift 1S libraries
    # "TRUE" adds the `swift=t` setting to the bbduk quality trimming/filtering command,
    # so set it if the data was generated with Swift 1S library prep
    # for info on this see, e.g., https://swiftbiosci.com/wp-content/uploads/2019/03/16-0853-Tail-Trim-Final-442019.pdf
swift_1S: "FALSE"

## memory for bbmap's pileup.sh (within the get_cov_and_det rule)
# handed over as the -Xmx parameter: 20g means 20 gigs of RAM, 20m means 20 megabytes
# 5g should be sufficient for most assemblies; if that rule is failing, this may need to be increased
pileup_mem: "5g"

################################################################################################################
##### Resource specifications that may need to be changed (mostly only necessary if using a job scheduler) #####
####### Could leave these as-is to start, but they are here to be increased if a job fails due to memory #######
################################################################################################################

### each value below goes to the "mem_mb" argument of the "resources" directive of its rule in the
    # Snakefile (so all of them should be provided in terms of megabytes)

# for megahit in the assembly_PE and assembly_SE rules
    # should match "max_mem_megahit" above, just written differently: as "mem_mb",
    # 100000 would be equal to the default 100e9 set above for "max_mem_megahit"
megahit_memory_resources: 100000

# for pileup.sh within the get_cov_and_det rule
    # should match "pileup_mem" above, just written differently: as "mem_mb",
    # the 5g above (5 gigabytes) would be 5000 megabytes, so this variable is set to 5000
pileup_memory_resources: 5000

# for the mapping_SE and mapping_PE rules; as "mem_mb", 25000 here means 25 gigabytes of memory will be allocated by the scheduler
mapping_memory_resources: 25000

# for rule gtdbtk_on_MAGs
gtdbtk_memory_resources: 500000

# for rule checkm_on_bins
checkm_memory_resources: 250000

# for CAT in the tax_classification rule
CAT_memory_resources: 40000

# for KOFamScan in rule KO_annotation
KOFamScan_memory_resources: 5000


#######################################################
################# REFERENCE DATABASES #################
#######################################################
# The below variables probably shouldn't be changed unless we really want to for some reason.
# The workflow will check the location pointed to above for the below databases, and install them
# if they are not already there. It looks for the below "TRIGGER" filenames (they
# all end with "_DB_SETUP") in the directory for each database, which it creates when
# it sets them up initially. If we want to point to DBs that already exist on our setup,
# we need to add these (empty) files to their respective directories. The
# workflow just checks the file is there to know it doesn't need to set up the DB.
#
# Altogether, after being installed and unpacked, these will take up about 240 GB, but they may
# require up to 500 GB during installation and initial unpacking.

## specific database locations

# KOFamScan
KOFAMSCAN_DIR: "kofamscan_db"
KOFAMSCAN_TRIGGER_FILE: "KO_DB_SETUP"

# CAT
CAT_DIR: "CAT_prepare_20210107"
CAT_DL_FILE: "CAT_prepare_20210107.tar.gz"
CAT_DL_LINK: "tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz"
CAT_NR_GZ: "2021-01-07.nr.gz"
CAT_TRIGGER_FILE: "CAT_DB_SETUP"
CAT_DB: "2021-01-07_CAT_database"
CAT_TAX: "2021-01-07_taxonomy"

# GTDB-tk
GTDB_DATA_PATH: "GTDB-tk-ref-db"
GTDB_MASH_REF_PATH: "gtdb.msh"
GTDB_TRIGGER_FILE: "GTDBTK_DB_SETUP"

# CheckM2
CHECKM2_DATA_PATH: "checkm2-ref-dir"
CHECKM2_DB_FILENAME: "uniref100.KO.1.dmnd"
CHECKM2_TRIGGER_FILE: "CHECKM2_DB_SETUP"

## example usage command ##
# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p

# `--use-conda` – this specifies to use the conda environments included in the workflow
# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on).
# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this)
# `-p` – specifies to print out each command being run to the screen

# See `snakemake -h` for more options and details.