Merge branch 'devel' of https://github.com/gis-rpd/pipelines into devel

gis-rpd · Oct 3, 2017 · 1aa81b7 · 1aa81b7
2 parents 95fcd21 + c1c4779
commit 1aa81b7
Show file tree

Hide file tree

Showing 59 changed files with 1,026 additions and 137 deletions.
diff --git a/bcl2fastq/bcl2fastq_qc.py b/bcl2fastq/bcl2fastq_qc.py
@@ -252,6 +252,14 @@ def run_qc_checks(project_dirs, machine_type):
         if v == 0:
             # no passed reads? no point in continuing checks
             continue
+
+        # test: % PF clusters 
+        v = values['% PF Clusters']
+        l = config['min-pf-clusters'][machine_type]
+        if v < float(l):
+            qcfails.append(
+                "Percent PF clusters under limit"
+                " ({} < {}) for lane {}".format(v, l, lane))
 
         # test: perfect barcode
         v = values['% Perfect barcode']
@@ -270,14 +278,15 @@ def run_qc_checks(project_dirs, machine_type):
                 "Percent Q30 bases under limit"
                 " ({} < {}) for lane {}".format(v, l, lane))
 
+        """
         # test: yield
         v = values['Yield (Mbases)'] / 1000.0
         l = config['min-lane-yield-gb'][machine_type]
         if v < float(l):
             qcfails.append(
                 "Yield under limit"
                 " ({} < {}) for lane {}".format(v, l, lane))
-
+        """
 
 
     # info about 'undetermined' sits elsewhere (only makes sense if demuxed)

diff --git a/bcl2fastq/example-dag.pdf b/bcl2fastq/example-dag.pdf
diff --git a/chromatin-profiling/atacseq/README.md b/chromatin-profiling/atacseq/README.md
@@ -0,0 +1,41 @@
+# atacseq
+
+## Summary
+
+This pipeline calls signals of single-cell ATAC-seq data using MACS2.
+
+The following steps are performed (modelled after
+[Buenrostro et al., (2015)](https://www.ncbi.nlm.nih.gov/pubmed/26083756)):
+- The input reads (assumed to be adapter trimmed) are mapped with
+  Bowtie2 and iscordant and unaligned reads are immediately dropped
+- Duplicates are then marked with Sambamba
+- Duplicated reads as well as those with low mapping quality (Q30) and
+  reads not mapping against autosomes (see `cfg/references.yaml`) are
+  removed
+- Peaks are called with MACS2 (broad and narrow). Peaks summits are
+  converted to bed files (extended with `--peak-ext-bp`)-
+- MACS2 is run with `--nomodel -f BAMPE --nolambda --call-summits` for
+  PE data and `--nomodel --shift -100 --extsize 200` for SE data (see
+  `--shift` and `--extsize` for SE specific MACS2 options)
+
+## Input
+
+- The input reads are assumed to be adapter trimmed
+- Fragment length for Bowtie can be set with `--fragment-length`
+- For default references see option `--references-cfg` and `cfg/references.yaml`
+
+## Output
+
+- Results appear per sample in `out/{sample}/`
+- Filtered BAM file: `out/{sample}/{sample}.bowtie2.dedup.flt.bam`
+- Mapping stats are named as bamfiles with `stats` attached
+- Called peaks and their coordinates `out/{sample/macs2-{peaktype}/{sample}_peaks.xls`
+- Extended peak summit location `out/{sample}/macs2-{peaktype}/{sample}_summits.bed`
+- Tag density profile out/{sample}/macs2-{peaktype}/{sample}_treat_pileup.bw
+
+where `{peaktype}` is 'narrow' or 'broad'
+
+## References
+
+- [Bowtie2 homepage](http://bowtie-bio.sf.net/bowtie2)
+- [MACS2 homepage](https://github.com/taoliu/MACS)
diff --git a/chromatin-profiling/atacseq/Snakefile b/chromatin-profiling/atacseq/Snakefile
@@ -0,0 +1,210 @@
+# standard library imports
+#
+import os
+
+# third party imports
+#
+from snakemake.utils import report
+
+# project specific imports
+# /
+
+
+LIB_PATH = os.path.abspath(
+    os.path.join(os.path.dirname(os.path.realpath(workflow.snakefile)), "..", "..", "lib"))
+if LIB_PATH not in sys.path:
+    sys.path.insert(0, LIB_PATH)
+from readunits import fastqs_from_unit_as_list
+
+RESULT_OUTDIR = 'out'
+
+
+# non-login bash
+shell.executable("/bin/bash")
+shell.prefix("source rc/snakemake_env.rc;")
+
+
+include: "../../rules/logging.rules"
+include: "../../rules/samtools.rules"
+include: "../../rules/sambamba.rules"
+include: "../../rules/report.rules"
+
+
+localrules: final, report
+# translate macs2 directory name (which indicates peak mode as macs2-{key}) to actual peak arg.
+MACS2PEAK_TO_ARG = {
+    'narrow': "",
+    'broad': "--broad"
+}
+
+
+METHOD_PEAK_COMB = ["macs2-narrow", "macs2-broad"]
+rule final:
+    input:
+        macs2_bed = expand(
+            os.path.join(RESULT_OUTDIR, "{sample}/{methodpeak}/{sample}_extbp_seq.bed"),
+            sample=config['samples'],
+            methodpeak=METHOD_PEAK_COMB),
+        bws = expand(
+            os.path.join(RESULT_OUTDIR, "{sample}/{methodpeak}/{sample}_treat_pileup.bw"),
+            sample=config['samples'],
+            methodpeak=METHOD_PEAK_COMB),
+        stats = expand(
+            os.path.join(RESULT_OUTDIR, '{sample}/{sample}.%s.dedup.flt.bamstats/stats.txt' % config['mapper']),
+            sample=config['samples']),
+        stats_proc = expand(
+            os.path.join(RESULT_OUTDIR, '{sample}/{sample}.%s.bamstats/stats.txt' % config['mapper']),
+            sample=config['samples']),
+
+
+def bowtie_fastq_arg(wc):
+    """Construct Bowtie Fastq args"""
+    fastq_arg = ""
+
+    # fastq list of lists
+    fastq_ll = [fastqs_from_unit_as_list(config['readunits'][unit])
+                for unit in config['samples'][wc.sample]]
+
+    assert len(set([len(x) for x in fastq_ll]))==1# making sure all PE or SE
+    if len(fastq_ll[0]) == 2:
+        sample_is_pe = True
+    else:
+        sample_is_pe = False
+
+    z = list(zip(*fastq_ll))
+    if sample_is_pe:
+        fastq_arg = "-1 {} -2 {}".format(
+            ",".join(z[0]), ",".join(z[1]))
+    else:
+        fastq_arg = "-U {}".format(
+            ",".join(z[0]))
+    return fastq_arg
+
+
+def bowtie_fastq_in(wc):
+    """helper function"""
+    return [x.strip() for x in bowtie_fastq_arg(wc)[3:].replace(" -2 ", ", ").split(",")]
+
+
+
+rule bowtie:
+    input:
+        fastqs = bowtie_fastq_in, 
+        bowtie2index = os.path.splitext(config['references']['genome'])[0] + ".1.bt2",# incomplete but should do
+    output:
+        bam = temp('{prefix}/{sample}.bowtie2.bam'),
+    log:
+        '{prefix}/{sample}.bowtie2.log'
+    params:
+        # mark_short_splits = MARK_SHORT_SPLITS,
+        bowtie2_custom_args = config.get("bowtie2_custom_args", ""),
+        sort_mem = '250M',
+        rg_id = lambda wc: wc.sample,
+        sample = lambda wc: wc.sample,
+        fragment_length = "-X {}".format(config['fragment_length']),
+        fastq_arg = bowtie_fastq_arg,
+        platform = config.get('platform', "Illumina"),
+        center = config.get('center', "GIS"),
+    threads:
+        8
+    shell:
+        "{{ bowtie2 -x $(echo {input.bowtie2index} | sed -e 's,.1.bt2,,') -t -p {threads}"
+        " --rg-id {params.rg_id} --rg 'PL:{params.platform}' --rg 'SM:{params.sample}' --rg 'CN:{params.center}'"
+        " {params.bowtie2_custom_args} {params.fastq_arg} {params.fragment_length} --no-mixed --no-discordant --no-unal |"
+        " samtools fixmate -O sam - - | "
+        " samtools sort -@ {threads} -m {params.sort_mem} -o {output.bam} -T {output.bam}.tmp -;"
+        " }} >& {log}"
+
+
+rule bam_filter:
+    input:
+        bam = "{prefix}.bam",
+        bai = "{prefix}.bam.bai"
+    output:
+        bam = "{prefix}.flt.bam"
+    params:
+        min_mq = "30",
+        incl_chr = config['references']['incl_chroms']
+    threads:
+        2
+    shell:
+        "samtools view -@ {threads} -F 0x400 -b -q {params.min_mq} -o {output.bam} {input.bam} {params.incl_chr}"
+
+
+rule macs2:
+    input:
+        bam = os.path.join(RESULT_OUTDIR, '{sample}/{sample}.%s.dedup.flt.bam' % config['mapper']),
+    output:
+        # all suffices predefined by macs2
+        peak_xls = '{outdir}/macs2-{peak}/{sample}_peaks.xls',
+        summits_bed = '{outdir}/macs2-{peak}/{sample}_summits.bed',
+        tbdg = temp('{outdir}/macs2-{peak}/{sample}_treat_pileup.bdg'),
+    log:
+        '{outdir}/macs2-{peak}.log'
+    params:
+        basename = lambda wc: wc.sample,
+        macs2_custom_args = config.get("macs2_custom_args", ""),
+        read_type_arg = '-f BAMPE --nolambda' if config['paired_end'] else "--shift {} --extsize {}".format(
+            config['shift'], config['extsize']),
+        peak_arg = lambda wc: MACS2PEAK_TO_ARG[wc.peak],
+        gsize = config["references"]["genomesize"],
+        call_summits_arg = lambda wc: "--call-summits" if wc.peak=='narrow' else ""
+    shell:
+        "{{ macs2 callpeak --treatment {input.bam}"
+        " --nomodel {params.call_summits_arg}"
+        " --gsize {params.gsize} --bdg {params.read_type_arg}"
+        " --keep-dup all"# we've marked dups already and want to avoid problems like https://github.com/taoliu/MACS/issues/78
+        " --outdir $(dirname {output.tbdg}) --name {params.basename}"
+        " {params.peak_arg} {params.macs2_custom_args} >& {log};"
+        # in default/narrow mode the bed file describes the 1bp summit
+        # which is $2+10 from narrowPeak. in --broad mode the bed file
+        # is missing (and so is $10 from the broadPeak file). so we
+        # create a bed-file, keeping the full peak range
+        #
+        #"test -e {output.summits_bed} || awk '{{d=$3-$2; mp=$2+int(d/2); printf \"%s\\t%d\\t%d\\t%s\\t%s\\n\", $1, mp, mp+1, $4, $9}}' $(ls $(dirname {output.tbdg})/*broadPeak) > {output.summits_bed};"
+        "test -e {output.summits_bed} || awk '{{printf \"%s\\t%d\\t%d\\t%s\\n\", $1, $2, $3, $9}}' $(ls $(dirname {output.tbdg})/*broadPeak) > {output.summits_bed}; }} >& {log}"
+
+
+rule bdg_to_bw:
+    input:
+        bdg = "{prefix}.bdg",
+        genome_sizes = config['references']['genome'] + ".fai",
+    output:
+        bw = "{prefix}.bw",
+    log:
+        "{prefix}.bw.log"
+    message:
+        "Converting bedgraph to bigwig"
+    shell:
+        "{{ "
+        # convert bedgraph to bigwig
+        # getting 'bdg is not case-sensitive sorted at line 35.  Please use "sort -k1,1 -k2,2n" with LC_COLLATE=C,  or bedSort and try again.'
+        "tmp={input.bdg}.srt;"
+        # LC_COLLATE won't always work
+        "LC_ALL=C sort -k1,1 -k2,2n {input.bdg} > $tmp;"
+        "bedGraphToBigWig $tmp {input.genome_sizes} {output.bw};"
+        "rm -f $tmp;"
+        " }} >& {log}"
+
+rule macs2_peak_to_fasta:
+    input:
+        summits_bed = '{rootdir}/macs2-{peak}/{prefix}_summits.bed',# crutch
+        genome = config['references']['genome'],
+    output:
+        seqbed = "{rootdir}/macs2-{peak}/{prefix}_extbp_seq.bed",
+    params:
+        # peak extended += ext_bp_arg i.e. 2*ext_bp_arg in total
+        # +- 250bp as in Ma et al. (2014), doi:10.1038/nprot.2014.083
+        peak_ext_bp = "{}".format(config["peak_ext_bp"]),
+    log:
+        "{rootdir}/macs2-{peak}/{prefix}_extbp.log"
+    message:
+        "Converting MACS2 peaks to fasta"
+    shell:
+        # see also comments on chipseq pipeline
+        "{{ "
+        "peaks=$(ls $(echo {input.summits_bed} | sed -e 's,_summits.bed,,')_peaks.[nb]*Peak);"
+        " awk -v ext={params.peak_ext_bp} '{{if (NF==9) {{d=$3-$2; mp=$2+int(d/2); c=$1; s=$2-ext; e=$2+ext;}} else if (NF==10) {{c=$1; s=$2+$10-ext; e=$2+$10+1+ext}} else {{exit 1}} printf \"%s\\t%d\\t%d\\t%s\\n\", c, s<0 ? 0 : s, e, $4;}}' $peaks > {output.seqbed};"
+        " }} >& {log}"
+
+