# config.yaml

#================================================================
# Basic Settings
#================================================================
# String value. Name of this run; it appears in the output path
# output/EXPERIMENT_NAME/EXPERIMENT_NAME.csv
experiment_name: 'ALLEGRO_EXAMPLE_RUN'
# ---

# ---------------------------------------------------------------
# Path Settings - Ignored if Easy Mode has a value
# ---------------------------------------------------------------

# All three keys are ignored if the Easy Mode key input_csv_path_with_guides
# (defined further below in this file) has a value.
# input_directory: presumably the directory holding the input fasta files - TODO confirm.
# input_species_path: path to the input species csv file.
# input_species_path_column: presumably the column of that csv holding each
#                            species' fasta file name - TODO confirm.
input_directory: 'data/input/example_input'
input_species_path: 'data/input/fifty_example_input_species.csv'
input_species_path_column: 'ortho_file_name'
# ---------------------------------------------------------------

# String value, default: 'track_e'
# track_a: any of the fasta records of a species can be targeted; the species
#          must be targeted at least `multiplicity` times anywhere.
# track_e: each fasta record has to be targeted at least `multiplicity` times.
track: 'track_e'
# ---

# Integer value, default: 1
# Minimum number of times a target must be hit; its meaning depends on `track`:
# In track_a, each species needs to be targeted at least this many times ANYWHERE.
# In track_e, EACH gene/record needs to be targeted at least this many times
# within that gene/record.
multiplicity: 1
# ---

# Possible values: True or False. Default: True
# Remove guides with GC content > gc_max or < gc_min from consideration?
# NOTE(review): gc_max/gc_min look like fractions in [0, 1] - confirm.
filter_by_gc: False
gc_max: 0.7  # Upper bound. Only used if filter_by_gc is True.
gc_min: 0.3  # Lower bound. Only used if filter_by_gc is True.
# ---

# ---------------------------------------------------------------
# Easy Mode - Has priority over Path Settings
# ---------------------------------------------------------------
# String value. If this has a value, the Path Settings keys (input_directory,
# input_species_path, input_species_path_column) are ignored.
# Leave as '' to use Path Settings instead.
# Presumably the path to a csv file that already contains guides - TODO confirm.
input_csv_path_with_guides: ''  # Default: ''

#================================================================
# Advanced Settings
#================================================================

# Possible choices: 'dummy' (default), 'ucrispr'
# 'ucrispr' uses a faster implementation of zhang2019.
# 'dummy' assigns a score of 1.0 to all gRNAs, essentially treating all guides as the same.
scorer: 'dummy'
# ---

# Integer value, default: 0
beta: 0  # The final size of the gRNA set must be <= this value. Think of it as your budget.
          # Setting it to 0 disables beta and causes ALLEGRO to find the smallest gRNA set,
          # IGNORING scores (treats all of the gRNAs as equals).
          # If set to the number of input species, the final size of the set may be up to
          # the number of species you have (worst case, one gRNA per species).
          # If set to a number HIGHER than the number of species, finds the best `beta` gRNAs.
# ---

# List of strings. Default: ['']
# ALLEGRO will only output guides that do not contain any of the patterns in this list.
# Supports up to 5 chained IUPAC codes; e.g., 'RYSN'.
# The exception to the 5-code limit is when positional nucleotides are used
# in conjunction with 'N's. E.g., NNNNNNNCNNNNGNNNN will exclude guides
# with C and G in those positions.
# Supports individual nucleotides; e.g., 'TTTT' excludes guides with quad-T in their sequence.
# Be careful not to place common nucleotides or IUPAC codes here, such as just 'A' or 'AG':
# you may end up excluding most or all guides from the calculation.

# See (presumably the rationale for excluding 'TTTT' - confirm):
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3568203/#:~:text=For%20vertebrates%2C%20as,of%20these%20species.
patterns_to_exclude: ['TTTT']
# ---

# Boolean: True or False. Default: False
# Significantly affects running time.
# True reports gRNAs with off-targets.
# report_up_to_n_mismatches: reports gRNAs with <= N mismatches after the seed region.
output_offtargets: False
report_up_to_n_mismatches: 3  # This may be [0-3]

# input_species_offtarget_dir: presumably the directory containing the
# background fasta files - TODO confirm.
# input_species_offtarget_column: the column in the input csv file with the name
# of the background fasta to check off-targets against.
input_species_offtarget_dir: 'data/input/example_input/'
input_species_offtarget_column: 'ortho_file_name'
# ---

# Boolean: True or False. Default: False
# Affects running time performance.
# Allows a guide that is within the set number of mismatches (after the seed region)
# of another guide to "inherit" the second guide's targets, essentially rendering
# the second guide useless and reducing the total number of guides needed.
# Works best when unscored guides are present, as it does not consider scores.
# Uses seed_region_is_n_upstream_of_pam and mismatches_allowed_after_seed_region.
preclustering: False

# Boolean: True or False. Default: False
# Affects running time performance.
# Compresses the output gRNA set by clustering similar gRNAs.
# Adds a new column called 'cluster' to output/EXPERIMENT_NAME/EXPERIMENT_NAME.csv
# Uses seed_region_is_n_upstream_of_pam and mismatches_allowed_after_seed_region.
postclustering: False
# The two keys below are shared by preclustering and postclustering.
seed_region_is_n_upstream_of_pam: 12
mismatches_allowed_after_seed_region: 3  # Integer value, default: 2 (this file sets 3)

# Integer value, measured in seconds. Default: 60
# Only used when solving the ILP, i.e., if there are remaining feasible guides with
# fractional values after solving the LP.
# The search for an optimal solution stops after this many seconds.
early_stopping_patience: 60
# ---

# Integer value. Default: 3
# Controls a preprocessing step that removes redundant guides.
# A higher value sacrifices more running time for lower memory consumption.
# Use it if you need to save memory.
# Max value is the total number of genes if using track E, and
# the total number of species if using track A.
# NOTE(review): this file sets 0 rather than the default 3; presumably 0
# disables the preprocessing step - confirm.
mp_threshold: 0
# ---

# Boolean: True or False. Default: True
# When a problem is deemed unsolvable (e.g., Status: MPSOLVER_INFEASIBLE),
# enabling diagnostics will attempt to relax each constraint and re-solve the problem.
# If the new problem with the relaxed constraint is solvable, ALLEGRO outputs
# the culprit gene/species.
# Currently, to stop this process, you need to find the PID of
# the python process running ALLEGRO using: $ top
# and kill it manually: $ kill -SIGKILL PID
enable_solver_diagnostics: True
# ---