# project.yml

#########################################
# THIS IS A TYPICAL project.yml TEMPLATE.
# Most of the settings presented here
# are suitable for mapping production data,
# but the user must nonetheless carefully
# consider every option.
#########################################

#########################################
# When commenting parameters out, make sure
# that each section still has at least one
# uncommented parameter; otherwise it
# will not be parsed properly.
#########################################


#######################################
# Provide paths to your raw input data (fastq files):
#######################################
# Fastqs can be provided as:
# -- a pair of relative/absolute paths (one file per read side)
# -- http/ftp/s3 links
# -- sra:<SRA_NUMBER>, optionally followed by the indices of the first and
#    the last entry in the SRA in the form of "?start=<first>&end=<last>"
input:
    raw_reads_paths:
        # Substitute the location of your fastq files.
        # An example of a 1-lane library:
        your_library_name_1:
            lane1:
                - /path/to/your/fastq/from/library1/library1_side1.fastq.gz
                - /path/to/your/fastq/from/library1/library1_side2.fastq.gz
        # An example of a 2-lane library:
        # mapped reads from both lanes will be pooled together
        # and deduplicated as one.
        your_library_name_2:
            lane1:
                - /path/to/your/fastq/from/library2/library2_lane1_side1.fastq.gz
                - /path/to/your/fastq/from/library2/library2_lane1_side2.fastq.gz
            lane2:
                - /path/to/your/fastq/from/library2/library2_lane2_side1.fastq.gz
                - /path/to/your/fastq/from/library2/library2_lane2_side2.fastq.gz
        # An example of a 3-lane library:
        # mapped reads from all three lanes will be pooled together
        # and deduplicated as one.
        your_library_name_3:
            lane1:
                - /path/to/your/fastq/from/library3/library3_lane1_side1.fastq.gz
                - /path/to/your/fastq/from/library3/library3_lane1_side2.fastq.gz
            lane2:
                - /path/to/your/fastq/from/library3/library3_lane2_side1.fastq.gz
                - /path/to/your/fastq/from/library3/library3_lane2_side2.fastq.gz
            lane3:
                - /path/to/your/fastq/from/library3/library3_lane3_side1.fastq.gz
                - /path/to/your/fastq/from/library3/library3_lane3_side2.fastq.gz
        # An example of a library whose lanes are fetched from remote sources:
        your_library_online:
            lane1:
                # Files can be provided as http/ftp/s3 links
                - https://github.com/open2c/distiller-test-data/raw/master/fastq/MATalpha_R1/lane2/SRR2601843_1.fastq.gz
                - https://github.com/open2c/distiller-test-data/raw/master/fastq/MATalpha_R1/lane2/SRR2601843_2.fastq.gz
            lane2:
                # Alternatively, reads can be downloaded from SRA
                - sra:SRR2601842
            lane3:
                # or restricted to a range of SRA entries:
                - sra:SRR2601843?start=0&end=10000


    # Independent libraries can be combined together
    # at the level of binned data (.cool files).
    # Describe your groupings of choice here:
    library_groups:
        libraries_1_and_2:
            - your_library_name_1
            - your_library_name_2
        all_libraries:
            - your_library_name_1
            - your_library_name_2
            - your_library_name_3
            - your_library_online


    # Truncate input fastqs to a small number of reads (e.g. 10000) for
    # semi-dry test runs.
    # NOTE: when the inputs are specified as an SRA number, only this number of
    # reads is downloaded!
    truncate_fastq_reads: 0

    # Specify a reference genome to align sequenced reads.
    # Provide the genome assembly name, a wildcard path to the BWA index files
    # of the reference genome, and a tab-separated table with contig sizes
    # (known as "chrom.sizes"). The latter is used to specify the subset and the
    # order of contigs in a resulting contact map.
    genome:
        assembly_name: 'hg19'
        bwa_index_wildcard_path: '/path/to/your/reference/genome/hg19.fa.*'
        chrom_sizes_path: '/path/to/your/reference/genome/hg19.reduced.chrom.sizes'

# Choose whether to run FastQC on the input files.
# Written as a canonical lowercase boolean, matching the other flags in this file.
do_fastqc: false

# Control how reads are mapped to the reference genome.
map:
    # A non-zero 'chunksize' splits each input file into multiple chunks
    # that are mapped separately. This is useful on clusters with many
    # relatively weak nodes.
    # The optimal chunk size balances mapping against merging: smaller
    # chunks (~30M) suit clusters with many weak nodes, but having more
    # than ~10 chunks per run slows down merging.
    chunksize: 0

    # Extra BWA mapping options.
    mapping_options: ''

    # fastp trim options,
    # e.g.: --detect_adapter_for_pe -q 15
    trim_options: ''

    # A more technical option: use a custom script to split fastq files from
    # SRA into two files, one per read side. It is true by default, which is
    # faster (multi-threaded compression can be used) but less stable.
    # Set to false if you download files from SRA and bwa complains about
    # unpaired reads.
    use_custom_split: true

# Control how read alignments are converted ('parsed') into Hi-C pairs.
parse:
    # If 'make_pairsam' is true, parsed Hi-C pairs will store complete
    # alignment records in the SAM format (the resulting hybrid between the
    # .pairs and .sam formats is called '.pairsam'). Such files can be useful for
    # thorough investigation of Hi-C data. Downstream of parsing, pairsams
    # are split into .pairs and .bam, and .bam alignments are tagged with
    # Hi-C related information. 'make_pairsam' roughly doubles the storage
    # and I/O requirements and should be used only when absolutely needed.
    # NOTE: when 'make_pairsam' is false, the initial output of parsing is still
    # called '.pairsam' despite missing SAM alignments, for technical reasons.
    make_pairsam: false

    # When 'make_pairsam' is true, enabling 'drop_seq' erases sequences and
    # Phred scores from the SAM alignments in .pairsam and .bam output files.
    # Enable to make lightweight .pairsam/.bam output.
    # NOTE: when 'make_pairsam' is false, 'drop_seq' is ignored.
    drop_seq: false

    # Enable 'drop_readid' to drop readID from .pairs files to create
    # lightweight .pairs files.
    # NOTE: does not affect alignment records in the .pairsam files and
    # subsequently .bam files after .pairsam splitting.
    drop_readid: true

    # When 'keep_unparsed_bams' is true, distiller preserves the _immediate_
    # output of bwa in a .bam format. Could be used as a faster alternative
    # to 'make_pairsam' when alignments are needed, but tagging them with Hi-C
    # related information is not necessary.
    keep_unparsed_bams: false

    # Pass extra options to pairtools parse, on top of the ones specified by
    # flags 'make_pairsam', 'drop_readid', 'drop_seq'. The default value
    # enables storing MAPQ scores in the .pairsam/.pairs output, which are
    # used later for filtering/binning.
    parsing_options: '--add-columns mapq'

# Control the detection of PCR/optical duplicates in the data.
dedup:
    # Two Hi-C pairs are treated as PCR/optical duplicates when their mapped
    # locations match on both sides. 'max_mismatch_bp' sets the largest
    # mismatch in mapped location allowed on either side for two pairs to
    # still count as duplicates.
    max_mismatch_bp: 1

# Control the binning of Hi-C pairs into contact maps, stored in .cool files.
bin:
    # Resolutions to include in the multi-resolution .cool file.
    # The base resolution (the smallest value listed) _must_ be a common
    # divisor of all other resolutions.
    resolutions:
        - 10000000
        - 5000000
        - 2500000
        - 1000000
        - 500000
        - 250000
        - 100000
        - 50000
        - 25000
        - 10000
        - 5000
        - 2000
        - 1000
    # Whether the multi-resolution .cool output files should be balanced.
    balance: true

    # Uncomment to pass additional parameters to cooler balance:
    # balance_options: ''

    # Additional filters applied to pairs during binning.
    # Multiple filters are allowed; for each filter, all pairs satisfying
    # its expression are binned into a separate cooler.
    # Each filter is written as:
    # {filter_name}: '{a valid Python expression}'
    filters:
        no_filter: ''
        mapq_30: '(mapq1>=30) and (mapq2>=30)'

########################################
# Folder structure for storing results
########################################
output:
    # One directory per kind of pipeline output.
    # NOTE(review): some entries end with '/' and some do not; confirm that
    # the pipeline treats both forms the same before normalizing them.
    dirs:
        processed_fastqs: 'results/processed_fastqs/'
        mapped_parsed_sorted_chunks: 'results/mapped_parsed_sorted_chunks'
        fastqc: 'results/fastqc/'
        pairs_library: 'results/pairs_library'
        coolers_library: 'results/coolers_library/'
        coolers_library_group: 'results/coolers_library_group/'
        stats_library_group: 'results/stats_library_group/'