Skip to content

Commit

Permalink
Merge branch 'devel' of https://github.com/gis-rpd/pipelines into devel
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas WILM committed Oct 3, 2017
2 parents 95fcd21 + c1c4779 commit 1aa81b7
Show file tree
Hide file tree
Showing 59 changed files with 1,026 additions and 137 deletions.
11 changes: 10 additions & 1 deletion bcl2fastq/bcl2fastq_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,14 @@ def run_qc_checks(project_dirs, machine_type):
if v == 0:
# no passed reads? no point in continuing checks
continue

# test: % PF clusters
v = values['% PF Clusters']
l = config['min-pf-clusters'][machine_type]
if v < float(l):
qcfails.append(
"Percent PF clusters under limit"
" ({} < {}) for lane {}".format(v, l, lane))

# test: perfect barcode
v = values['% Perfect barcode']
Expand All @@ -270,14 +278,15 @@ def run_qc_checks(project_dirs, machine_type):
"Percent Q30 bases under limit"
" ({} < {}) for lane {}".format(v, l, lane))

"""
# test: yield
v = values['Yield (Mbases)'] / 1000.0
l = config['min-lane-yield-gb'][machine_type]
if v < float(l):
qcfails.append(
"Yield under limit"
" ({} < {}) for lane {}".format(v, l, lane))

"""


# info about 'undetermined' sits elsewhere (only makes sense if demuxed)
Expand Down
Binary file modified bcl2fastq/example-dag.pdf
Binary file not shown.
41 changes: 41 additions & 0 deletions chromatin-profiling/atacseq/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# atacseq

## Summary

This pipeline calls signals of single-cell ATAC-seq data using MACS2.

The following steps are performed (modelled after
[Buenrostro et al., (2015)](https://www.ncbi.nlm.nih.gov/pubmed/26083756)):
- The input reads (assumed to be adapter trimmed) are mapped with
Bowtie2 and iscordant and unaligned reads are immediately dropped
- Duplicates are then marked with Sambamba
- Duplicated reads as well as those with low mapping quality (Q30) and
reads not mapping against autosomes (see `cfg/references.yaml`) are
removed
- Peaks are called with MACS2 (broad and narrow). Peaks summits are
converted to bed files (extended with `--peak-ext-bp`)-
- MACS2 is run with `--nomodel -f BAMPE --nolambda --call-summits` for
PE data and `--nomodel --shift -100 --extsize 200` for SE data (see
`--shift` and `--extsize` for SE specific MACS2 options)

## Input

- The input reads are assumed to be adapter trimmed
- Fragment length for Bowtie can be set with `--fragment-length`
- For default references see option `--references-cfg` and `cfg/references.yaml`

## Output

- Results appear per sample in `out/{sample}/`
- Filtered BAM file: `out/{sample}/{sample}.bowtie2.dedup.flt.bam`
- Mapping stats are named as bamfiles with `stats` attached
- Called peaks and their coordinates `out/{sample/macs2-{peaktype}/{sample}_peaks.xls`
- Extended peak summit location `out/{sample}/macs2-{peaktype}/{sample}_summits.bed`
- Tag density profile out/{sample}/macs2-{peaktype}/{sample}_treat_pileup.bw

where `{peaktype}` is 'narrow' or 'broad'

## References

- [Bowtie2 homepage](http://bowtie-bio.sf.net/bowtie2)
- [MACS2 homepage](https://github.com/taoliu/MACS)
210 changes: 210 additions & 0 deletions chromatin-profiling/atacseq/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# standard library imports
#
import os

# third party imports
#
from snakemake.utils import report

# project specific imports
# /


LIB_PATH = os.path.abspath(
os.path.join(os.path.dirname(os.path.realpath(workflow.snakefile)), "..", "..", "lib"))
if LIB_PATH not in sys.path:
sys.path.insert(0, LIB_PATH)
from readunits import fastqs_from_unit_as_list

RESULT_OUTDIR = 'out'


# non-login bash
shell.executable("/bin/bash")
shell.prefix("source rc/snakemake_env.rc;")


include: "../../rules/logging.rules"
include: "../../rules/samtools.rules"
include: "../../rules/sambamba.rules"
include: "../../rules/report.rules"


localrules: final, report
# translate macs2 directory name (which indicates peak mode as macs2-{key}) to actual peak arg.
MACS2PEAK_TO_ARG = {
'narrow': "",
'broad': "--broad"
}


METHOD_PEAK_COMB = ["macs2-narrow", "macs2-broad"]
rule final:
input:
macs2_bed = expand(
os.path.join(RESULT_OUTDIR, "{sample}/{methodpeak}/{sample}_extbp_seq.bed"),
sample=config['samples'],
methodpeak=METHOD_PEAK_COMB),
bws = expand(
os.path.join(RESULT_OUTDIR, "{sample}/{methodpeak}/{sample}_treat_pileup.bw"),
sample=config['samples'],
methodpeak=METHOD_PEAK_COMB),
stats = expand(
os.path.join(RESULT_OUTDIR, '{sample}/{sample}.%s.dedup.flt.bamstats/stats.txt' % config['mapper']),
sample=config['samples']),
stats_proc = expand(
os.path.join(RESULT_OUTDIR, '{sample}/{sample}.%s.bamstats/stats.txt' % config['mapper']),
sample=config['samples']),


def bowtie_fastq_arg(wc):
"""Construct Bowtie Fastq args"""
fastq_arg = ""

# fastq list of lists
fastq_ll = [fastqs_from_unit_as_list(config['readunits'][unit])
for unit in config['samples'][wc.sample]]

assert len(set([len(x) for x in fastq_ll]))==1# making sure all PE or SE
if len(fastq_ll[0]) == 2:
sample_is_pe = True
else:
sample_is_pe = False

z = list(zip(*fastq_ll))
if sample_is_pe:
fastq_arg = "-1 {} -2 {}".format(
",".join(z[0]), ",".join(z[1]))
else:
fastq_arg = "-U {}".format(
",".join(z[0]))
return fastq_arg


def bowtie_fastq_in(wc):
"""helper function"""
return [x.strip() for x in bowtie_fastq_arg(wc)[3:].replace(" -2 ", ", ").split(",")]



rule bowtie:
input:
fastqs = bowtie_fastq_in,
bowtie2index = os.path.splitext(config['references']['genome'])[0] + ".1.bt2",# incomplete but should do
output:
bam = temp('{prefix}/{sample}.bowtie2.bam'),
log:
'{prefix}/{sample}.bowtie2.log'
params:
# mark_short_splits = MARK_SHORT_SPLITS,
bowtie2_custom_args = config.get("bowtie2_custom_args", ""),
sort_mem = '250M',
rg_id = lambda wc: wc.sample,
sample = lambda wc: wc.sample,
fragment_length = "-X {}".format(config['fragment_length']),
fastq_arg = bowtie_fastq_arg,
platform = config.get('platform', "Illumina"),
center = config.get('center', "GIS"),
threads:
8
shell:
"{{ bowtie2 -x $(echo {input.bowtie2index} | sed -e 's,.1.bt2,,') -t -p {threads}"
" --rg-id {params.rg_id} --rg 'PL:{params.platform}' --rg 'SM:{params.sample}' --rg 'CN:{params.center}'"
" {params.bowtie2_custom_args} {params.fastq_arg} {params.fragment_length} --no-mixed --no-discordant --no-unal |"
" samtools fixmate -O sam - - | "
" samtools sort -@ {threads} -m {params.sort_mem} -o {output.bam} -T {output.bam}.tmp -;"
" }} >& {log}"


rule bam_filter:
input:
bam = "{prefix}.bam",
bai = "{prefix}.bam.bai"
output:
bam = "{prefix}.flt.bam"
params:
min_mq = "30",
incl_chr = config['references']['incl_chroms']
threads:
2
shell:
"samtools view -@ {threads} -F 0x400 -b -q {params.min_mq} -o {output.bam} {input.bam} {params.incl_chr}"


rule macs2:
input:
bam = os.path.join(RESULT_OUTDIR, '{sample}/{sample}.%s.dedup.flt.bam' % config['mapper']),
output:
# all suffices predefined by macs2
peak_xls = '{outdir}/macs2-{peak}/{sample}_peaks.xls',
summits_bed = '{outdir}/macs2-{peak}/{sample}_summits.bed',
tbdg = temp('{outdir}/macs2-{peak}/{sample}_treat_pileup.bdg'),
log:
'{outdir}/macs2-{peak}.log'
params:
basename = lambda wc: wc.sample,
macs2_custom_args = config.get("macs2_custom_args", ""),
read_type_arg = '-f BAMPE --nolambda' if config['paired_end'] else "--shift {} --extsize {}".format(
config['shift'], config['extsize']),
peak_arg = lambda wc: MACS2PEAK_TO_ARG[wc.peak],
gsize = config["references"]["genomesize"],
call_summits_arg = lambda wc: "--call-summits" if wc.peak=='narrow' else ""
shell:
"{{ macs2 callpeak --treatment {input.bam}"
" --nomodel {params.call_summits_arg}"
" --gsize {params.gsize} --bdg {params.read_type_arg}"
" --keep-dup all"# we've marked dups already and want to avoid problems like https://github.com/taoliu/MACS/issues/78
" --outdir $(dirname {output.tbdg}) --name {params.basename}"
" {params.peak_arg} {params.macs2_custom_args} >& {log};"
# in default/narrow mode the bed file describes the 1bp summit
# which is $2+10 from narrowPeak. in --broad mode the bed file
# is missing (and so is $10 from the broadPeak file). so we
# create a bed-file, keeping the full peak range
#
#"test -e {output.summits_bed} || awk '{{d=$3-$2; mp=$2+int(d/2); printf \"%s\\t%d\\t%d\\t%s\\t%s\\n\", $1, mp, mp+1, $4, $9}}' $(ls $(dirname {output.tbdg})/*broadPeak) > {output.summits_bed};"
"test -e {output.summits_bed} || awk '{{printf \"%s\\t%d\\t%d\\t%s\\n\", $1, $2, $3, $9}}' $(ls $(dirname {output.tbdg})/*broadPeak) > {output.summits_bed}; }} >& {log}"


rule bdg_to_bw:
input:
bdg = "{prefix}.bdg",
genome_sizes = config['references']['genome'] + ".fai",
output:
bw = "{prefix}.bw",
log:
"{prefix}.bw.log"
message:
"Converting bedgraph to bigwig"
shell:
"{{ "
# convert bedgraph to bigwig
# getting 'bdg is not case-sensitive sorted at line 35. Please use "sort -k1,1 -k2,2n" with LC_COLLATE=C, or bedSort and try again.'
"tmp={input.bdg}.srt;"
# LC_COLLATE won't always work
"LC_ALL=C sort -k1,1 -k2,2n {input.bdg} > $tmp;"
"bedGraphToBigWig $tmp {input.genome_sizes} {output.bw};"
"rm -f $tmp;"
" }} >& {log}"

rule macs2_peak_to_fasta:
input:
summits_bed = '{rootdir}/macs2-{peak}/{prefix}_summits.bed',# crutch
genome = config['references']['genome'],
output:
seqbed = "{rootdir}/macs2-{peak}/{prefix}_extbp_seq.bed",
params:
# peak extended += ext_bp_arg i.e. 2*ext_bp_arg in total
# +- 250bp as in Ma et al. (2014), doi:10.1038/nprot.2014.083
peak_ext_bp = "{}".format(config["peak_ext_bp"]),
log:
"{rootdir}/macs2-{peak}/{prefix}_extbp.log"
message:
"Converting MACS2 peaks to fasta"
shell:
# see also comments on chipseq pipeline
"{{ "
"peaks=$(ls $(echo {input.summits_bed} | sed -e 's,_summits.bed,,')_peaks.[nb]*Peak);"
" awk -v ext={params.peak_ext_bp} '{{if (NF==9) {{d=$3-$2; mp=$2+int(d/2); c=$1; s=$2-ext; e=$2+ext;}} else if (NF==10) {{c=$1; s=$2+$10-ext; e=$2+$10+1+ext}} else {{exit 1}} printf \"%s\\t%d\\t%d\\t%s\\n\", c, s<0 ? 0 : s, e, $4;}}' $peaks > {output.seqbed};"
" }} >& {log}"


Loading

0 comments on commit 1aa81b7

Please sign in to comment.