# config/config.yaml


# Used for parallelization by Snakemake. This setting is based on the architecture of the metal-as-a-service
# node netdc-bms-c11g.maas, which has 56 cores. Adjust this setting as needed. For example, on most laptops
# a reasonable value might be 4 or 8.
cpu_cores: 56

# Used for scatter/gather processing. Corresponds with the number of families within the taxon defined in
# fasta_filter that have records for the specified marker. In practice, this means that the input BCDM TSV
# file has to have exactly this many distinct (non-empty) values in the `family` column for the rows where
# the column named under fasta_filter.filter_level has the value fasta_filter.filter_name (e.g. the `order`
# must be `Odonata`).
# TODO: derive this automatically from the data. At present, it needs to be calculated by the user from the
# input file, e.g. by first making the sqlite database, or by grepping the TSV file.
nfamilies: 36

# Number of outgroups to include in each family-level analysis. Must be at least 2.
outgroups: 3

# Logging verbosity, given as a level name; see the `logging` module from the stdlib.
log_level: 'INFO'

# Marker to use: either COI-5P or matK_rbcL
marker: COI-5P

# Substitution model, written in RAxML command-line syntax
model: GTR+G

# Minimum length to include. The value here fits COI-5P.
# NOTE(review): presumably needs revisiting when `marker` is changed - confirm.
minlength: 600

# Lower bound on the number of sequences per alignment
minseq: 3

# Which exemplars to pick. One of: tallest, shortest, median
exemplars: tallest

# How to rescale the subtrees. One of:
#   exemplars - subtree depth ratio
#   outgroup  - crown-to-crown distance ratio
scaling: exemplars

# Upper bound on the number of sequences per alignment
maxseq: 12000

# Data type for the MACSE alignment: nucleotide (NT) or amino acid (AA)
datatype: NT

# Choose which records from the database to use in the pipeline. filter_level and filter_name each take
# exactly one value.
# filter levels: class, order, family, genus, all (no filter)
fasta_filter:
  filter_level: order
  filter_name: Odonata

# NOTE(review): undocumented key. `name` resembles a conda environment-file field - confirm what reads it.
name: phylogeny

# NOTE(review): undocumented key. Presumably the name or location of the BLAST database - confirm
# against the workflow rules that use it.
blastdb: blastdb

# NOTE(review): undocumented key. This resembles the `dependencies` section of a conda environment
# file (a pip entry pointing at requirements.txt) - confirm whether anything reads it from this config.
dependencies:
  - pip:
      - -r requirements.txt

# Paths used by the workflow: files under resources/ and directories under results/.
# NOTE(review): the `hmm` path names COI-5P explicitly; presumably it has to be changed together
# with `marker` - confirm.
file_names:
  bold_tsv: resources/BOLD_Public.05-Apr-2024-curated.tsv
  open_tre: resources/opentree14.9_tree/labelled_supertree/labelled_supertree.tre
  hmm: resources/hmm/COI-5P.hmm
  fasta_dir: results/fasta/family
  blast_dir: results/blast