diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 5573fb6..0000000 --- a/.editorconfig +++ /dev/null @@ -1,15 +0,0 @@ -# EditorConfig is awesome: http://EditorConfig.org - -# top-most EditorConfig file -root = true - -[*] -end_of_line = lf -insert_final_newline = true -charset = utf-8 -indent_style = space -indent_size = 4 - -[*.{yml,yaml}] -indent_style = space -indent_size = 2 diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 9440e44..0000000 --- a/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -*.smk linguist-language=Python -*.snakefile linguist-language=Python diff --git a/.gitignore b/.gitignore deleted file mode 100644 index fdb60f6..0000000 --- a/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -**/.koparde* -!.editorconfig -!.gitignore -!.gitattributes -site/ diff --git a/LICENSE b/LICENSE deleted file mode 100644 index c5caeb8..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2021 Vishal Koparde - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index fcd525e..0000000 --- a/README.md +++ /dev/null @@ -1,18 +0,0 @@ -![GitHub issues](https://img.shields.io/github/issues/CCBR/CCBR_circRNA_AmpliconSeq)![forks](https://img.shields.io/github/forks/CCBR/CCBR_circRNA_AmpliconSeq)![stars](https://img.shields.io/github/stars/CCBR/CCBR_circRNA_AmpliconSeq)![LICENSE](https://img.shields.io/github/license/CCBR/CCBR_circRNA_AmpliconSeq) - -# CCBR circRNA AmpliconSeq Snakemake Workflow -This is a snakemake workflow to process circRNA AmpliconSeq datasets generated to study circRNAs in KSHV and human hosts using divergent primers. Some basic usage instructions are as follows: - -``` -USAGE: - bash ./run_circRNA_AmpliconSeq -m/--runmode= -w/--workdir= -Required Arguments: -1. RUNMODE: [Type: String] Valid options: - *) init : initialize workdir - *) run : run with slurm - *) reset : DELETE workdir dir and re-init it - *) dryrun : dry run snakemake to generate DAG - *) unlock : unlock workdir if locked by snakemake - *) runlocal : run without submitting to sbatch -2. WORKDIR: [Type: String]: Absolute or relative path to the output folder with write permissions. 
-``` diff --git a/config/cluster.json b/config/cluster.json deleted file mode 100644 index 9307f82..0000000 --- a/config/cluster.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "__default__": { - "gres": "lscratch:256", - "mem": "20g", - "partition": "ccr", "qos": "ccrcore", - "threads": "2", - "time": "4:00:00", - "name" : "{rule}.{wildcards}", - "output" : "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", - "error" : "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" - }, - "align":{ - "mem": "120g", - "threads": "56", - "time": "48:00:00" - }, - "cutadapt":{ - "mem": "120g", - "threads": "56", - "time": "48:00:00" - } -} diff --git a/config/config.yaml b/config/config.yaml deleted file mode 100644 index bd2cb79..0000000 --- a/config/config.yaml +++ /dev/null @@ -1,68 +0,0 @@ -## you probably need to change or comment/uncomment some of these -# -# The working dir... output will be in the results subfolder of the workdir -workdir: "WORKDIR" -# -# tab delimited samples file ... should have the following 3 columns -# sampleName path_to_R1_fastq path_to_R2_fastq -# -samples: "WORKDIR/samples.tsv" -# -# BSJ fasta file -# bsjfa: "/data/Ziegelbauer_lab/Sarah/MiSeq_R2/bsjs_020921.fasta" -# -# divergent primers information TSV file which will be converted to BED6 format -# The columns in this files are (see example at the bottom): -# Primer Chromosome Start End Strand PrimerSequence_5prime_to_3prime Reference -primertsv: "PIPELINE_HOME/resources/primers.tsv" -# primerbed: "PIPELINE_HOME/resources/primers.bed" -reffa: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.fa" -# -bowtie2_alignment_parameters: "--very-sensitive-local" -salmon_parameters: "-l U --maxReadOcc 3 --minAssignedFrags 60" -aggregate_quant_rowsum_filter: 50 # aggregate quant.sf across samples and filter out rows <= this filter value -# -# -# -# -# -# -## you most probably dont need to change these -scriptsdir: "PIPELINE_HOME/workflow/scripts" -resourcesdir: "PIPELINE_HOME/resources" -tools: "PIPELINE_HOME/config/tools.yaml" -cluster: "PIPELINE_HOME/config/cluster.json" -adapters: "PIPELINE_HOME/resources/TruSeq_and_nextera_adapters.consolidated.fa" -# bsjfadefault: "PIPELINE_HOME/resources/bsjs_020921.fasta.gz" -# -## Resources -# -# hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC -# -# ref_fa: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.fa" -# ref_gtf: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.gtf" -# regions: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.fa.regions" -# star_index_dir: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/STAR_index_no_GTF_2.7.6a" -# ref_bwa_index: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC" -# ref_hisat_index: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC" -# ref_bowtie1_index: "/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC" -# genepred_w_geneid: 
"/data/Ziegelbauer_lab/resources/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.v2/hg38_rRNA_masked_plus_rRNA_plus_viruses_plus_ERCC.genes.genepred_w_geneid" -# - -# Primer Chromosome Start End Strand PrimerSequence_5prime_to_3prime Reference -# circ29_F NC_009333.1 29751 29770 + ACCAGACGGCAAGGTTTTTA https://doi.org/10.1073/pnas.1816183115 -# circ29_R NC_009333.1 29641 29663 - TCGTTAGTCAACCTAGCAAAACA https://doi.org/10.1073/pnas.1816183115 -# circ57_F NC_009333.1 57271 57290 + CAACCAAAAGGCAGAGTCGT https://doi.org/10.1073/pnas.1816183115 -# circ57_R NC_009333.1 57057 57076 - GGCTGAACCCAAGAACTTCA https://doi.org/10.1073/pnas.1816183115 -# circvIRF4_F NC_009333.1 88288 88307 - CTCCGTGTGGATACCAGTGA 10.1128/JVI.01952-18 -# circvIRF4_R NC_009333.1 88746 88764 + TGGTTCCACGCAACAGTCT 10.1128/JVI.01952-18 -# circ97_F NC_009333.1 98176 98195 - GGAAGAAGCTCATGGACTGG https://doi.org/10.1073/pnas.1816183115 -# circ97_R NC_009333.1 97964 97983 + GACCTAAAAACCCGGAGGAG https://doi.org/10.1073/pnas.1816183115 -# circT0.7_F3 NC_009333.1 118221 118240 + AGTGAGGAGGGAGGAGGGCA -# circT0.7_R2 NC_009333.1 117701 117722 - CGCTCTCCCAAACCACACGAAT -# circORF72_F3 NC_009333.1 123582 123601 + AAGATTAAGGGCCAACGCGA -# circORF72_R1 NC_009333.1 123138 123157 - TAGTTCCTCAGCTGGCAAGC -# circ1400_F chr4 37631449 37631468 - ATGTCTGTTAGTGGGGCTGA https://doi.org/10.1073/pnas.1816183115 -# circ1400_R chr4 37638469 37638488 + TATCTGCTACCATCGCCTTT https://doi.org/10.1073/pnas.1816183115 -# circ1741_F chr7 129015044 129015063 - TCGTTCCTTACGAATTGGAG https://doi.org/10.1073/pnas.1816183115 -# circ1741_R chr7 129018113 129018132 + CTGCCGGATCTGTAACAACT https://doi.org/10.1073/pnas.1816183115 \ No newline at end of file diff --git a/config/samples.1.tsv b/config/samples.1.tsv deleted file mode 100644 index e8580f4..0000000 --- a/config/samples.1.tsv +++ /dev/null @@ -1,25 +0,0 @@ -sampleName path_to_R1_fastq path_to_R2_fastq -iSLK_KO_c1400 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1400/iSLK_KO_c1400_S15_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1400/iSLK_KO_c1400_S15_R2_001.fastq.gz -iSLK_KO_c1741 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1741/iSLK_KO_c1741_S16_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1741/iSLK_KO_c1741_S16_R2_001.fastq.gz -iSLK_KO_kcirc29 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_kcirc29/iSLK_KO_kcirc29_S9_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_kcirc29/iSLK_KO_kcirc29_S9_R2_001.fastq.gz -iSLK_KO_kcirc57 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_kcirc57/iSLK_KO_kcirc57_S10_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_kcirc57/iSLK_KO_kcirc57_S10_R2_001.fastq.gz -iSLK_KO_kcirc97 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_kcirc97/iSLK_KO_kcirc97_S11_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_kcirc97/iSLK_KO_kcirc97_S11_R2_001.fastq.gz -iSLK_KO_ORF72 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_ORF72/iSLK_KO_ORF72_S13_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_ORF72/iSLK_KO_ORF72_S13_R2_001.fastq.gz -iSLK_KO_T0_7 
/data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_T0_7/iSLK_KO_T0_7_S14_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_T0_7/iSLK_KO_T0_7_S14_R2_001.fastq.gz -iSLK_KO_vIRF4 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_vIRF4/iSLK_KO_vIRF4_S12_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_vIRF4/iSLK_KO_vIRF4_S12_R2_001.fastq.gz -iSLK_WT_c1400 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_c1400/iSLK_WT_c1400_S7_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_c1400/iSLK_WT_c1400_S7_R2_001.fastq.gz -iSLK_WT_c1741 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_c1741/iSLK_WT_c1741_S8_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_c1741/iSLK_WT_c1741_S8_R2_001.fastq.gz -iSLK_WT_kcirc29 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_kcirc29/iSLK_WT_kcirc29_S1_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_kcirc29/iSLK_WT_kcirc29_S1_R2_001.fastq.gz -iSLK_WT_kcirc57 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_kcirc57/iSLK_WT_kcirc57_S2_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_kcirc57/iSLK_WT_kcirc57_S2_R2_001.fastq.gz -iSLK_WT_kcirc97 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_kcirc97/iSLK_WT_kcirc97_S3_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_kcirc97/iSLK_WT_kcirc97_S3_R2_001.fastq.gz -iSLK_WT_ORF72 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_ORF72/iSLK_WT_ORF72_S5_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_ORF72/iSLK_WT_ORF72_S5_R2_001.fastq.gz -iSLK_WT_T0_7 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_T0_7/iSLK_WT_T0_7_S6_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_T0_7/iSLK_WT_T0_7_S6_R2_001.fastq.gz -iSLK_WT_vIRF4 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_vIRF4/iSLK_WT_vIRF4_S4_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_WT_vIRF4/iSLK_WT_vIRF4_S4_R2_001.fastq.gz -LEC_BAC16_c1400 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_c1400/LEC_BAC16_c1400_S23_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_c1400/LEC_BAC16_c1400_S23_R2_001.fastq.gz -LEC_BAC16_c1741 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_c1741/LEC_BAC16_c1741_S24_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_c1741/LEC_BAC16_c1741_S24_R2_001.fastq.gz -LEC_BAC16_kcirc29 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_kcirc29/LEC_BAC16_kcirc29_S17_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_kcirc29/LEC_BAC16_kcirc29_S17_R2_001.fastq.gz -LEC_BAC16_kcirc57 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_kcirc57/LEC_BAC16_kcirc57_S18_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_kcirc57/LEC_BAC16_kcirc57_S18_R2_001.fastq.gz -LEC_BAC16_kcirc97 
/data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_kcirc97/LEC_BAC16_kcirc97_S19_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_kcirc97/LEC_BAC16_kcirc97_S19_R2_001.fastq.gz -LEC_BAC16_ORF72 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_ORF72/LEC_BAC16_ORF72_S21_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_ORF72/LEC_BAC16_ORF72_S21_R2_001.fastq.gz -LEC_BAC16_T0_7 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_T0_7/LEC_BAC16_T0_7_S22_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_T0_7/LEC_BAC16_T0_7_S22_R2_001.fastq.gz -LEC_BAC16_vIRF4 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_vIRF4/LEC_BAC16_vIRF4_S20_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_LEC_BAC16_vIRF4/LEC_BAC16_vIRF4_S20_R2_001.fastq.gz diff --git a/config/samples.tsv b/config/samples.tsv deleted file mode 100644 index 2bf83b7..0000000 --- a/config/samples.tsv +++ /dev/null @@ -1,2 +0,0 @@ -sampleName path_to_R1_fastq path_to_R2_fastq -iSLK_KO_c1741 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1741/iSLK_KO_c1741_S16_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1741/iSLK_KO_c1741_S16_R2_001.fastq.gz \ No newline at end of file diff --git a/config/samples.tsv.testing b/config/samples.tsv.testing deleted file mode 100644 index 2bf83b7..0000000 --- a/config/samples.tsv.testing +++ /dev/null @@ -1,2 +0,0 @@ -sampleName path_to_R1_fastq path_to_R2_fastq -iSLK_KO_c1741 /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1741/iSLK_KO_c1741_S16_R1_001.fastq.gz /data/Ziegelbauer_lab/Sarah/MiSeq_R2/Flowcell_000000000-JBHWF/Sample_iSLK_KO_c1741/iSLK_KO_c1741_S16_R2_001.fastq.gz \ No newline at end of file diff --git a/config/tools.yaml b/config/tools.yaml deleted file mode 100644 index 4cb565b..0000000 --- a/config/tools.yaml +++ /dev/null @@ -1,29 +0,0 @@ -bowtie2: - version: "bowtie/2-2.4.2" -cutadapt: - version: "cutadapt/1.18" -# fastqc: -# version: "fastqc/0.11.9" -# hisat: -# version: "hisat/2.2.1.0" -# multiqc: -# version: "multiqc/1.9" -# picard: -# version: "picard/2.25.0" -python37: - version: "python/3.7" -salmon: - version: "salmon/1.4.0" -sambamba: - version: "sambamba/0.8.0" -R: - version: "R/4.0.3" -# samtools: -# version: "samtools/1.11" -# star: -# version: "STAR/2.7.6a" -# alignTranscriptsPerReadNmax: "20000" -# stringtie: -# version: "stringtie/2.1.4" -# ucsc: -# version: "ucsc/407" diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index e69de29..0000000 diff --git a/index.html b/index.html new file mode 100644 index 0000000..fddf223 --- /dev/null +++ b/index.html @@ -0,0 +1,8 @@ + + + Redirect to CCBR/CCBR_circRNA_AmpliconSeq + + + + + \ No newline at end of file diff --git a/mkdocs.yaml b/mkdocs.yaml deleted file mode 100644 index 5aeb0cd..0000000 --- a/mkdocs.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# Project Information -site_name: circRNA ampliconSeq -site_author: Vishal Koparde, Ph.D. 
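The samples.tsv variants deleted above all follow one schema: a tab-separated table with a header of sampleName, path_to_R1_fastq and path_to_R2_fastq, one row per sample. A minimal sketch, assuming pandas, of how workflow/rules/init.smk (further down in this diff) consumes that table, with an empty R2 column marking a sample as single-end:

```
import os
import sys

import pandas as pd

def load_samples(samples_tsv):
    """Parse samples.tsv: tab-separated, indexed by sampleName."""
    df = pd.read_csv(samples_tsv, sep="\t", header=0, index_col="sampleName")
    df["PEorSE"] = "PE"
    for sample in df.index:
        r1 = df["path_to_R1_fastq"][sample]
        if not os.access(r1, os.R_OK):
            sys.exit("File: {} is missing or unreadable!".format(r1))
        # init.smk treats a missing R2 path (NaN after read_csv) as single-end
        if pd.isna(df["path_to_R2_fastq"][sample]):
            df.loc[sample, "PEorSE"] = "SE"
    return df

# samples = load_samples("config/samples.tsv")
```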
-# Repository -repo_name: CCBR/CCBR_circRNA_AmpliconSeq -repo_url: https://github.com/CCBR/CCBR_circRNA_AmpliconSeq - -# Copyright -copyright: Copyright © 2021 CCBR - -# Configuration -theme: - name: material - palette: - scheme: default - -plugins: - - search - -# Customization -extra: - social: - - icon: fontawesome/solid/users - link: http://bioinformatics.cancer.gov - - icon: fontawesome/brands/github - link: https://github.com/CCBR - - icon: fontawesome/brands/docker - link: https://hub.docker.com/orgs/nciccbr/repositories - -# Page Tree -nav: - - Home: index.md - - Flowchart: flowchart.md - - Tutorial: tutorial.md - - References: references.md - - Versions: versions.md diff --git a/resources/TruSeq_and_nextera_adapters.consolidated.fa b/resources/TruSeq_and_nextera_adapters.consolidated.fa deleted file mode 100755 index de67830..0000000 --- a/resources/TruSeq_and_nextera_adapters.consolidated.fa +++ /dev/null @@ -1,94 +0,0 @@ ->Nextera_PrefixNX/1 -AGATGTGTATAAGAGACAG ->Nextera_Trans1 -TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG ->Nextera_Trans1_rc -CTGTCTCTTATACACATCTGACGCTGCCGACGA ->Nextera_Trans2 -GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG ->Nextera_Trans2_rc -CTGTCTCTTATACACATCTCCGAGCCCACGAGAC ->TruSeq3_PE1 -TACACTCTTTCCCTACACGACGCTCTTCCGATCT ->TruSeq3_PE1_rc -AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA ->TruSeq3_PE2 -GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT ->TruSeq3_PE2_rc -AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC ->TruSeq_Small_RNA -TGGAATTCTCGGGTGCCAAGG ->NEB_miRNA_3primeACTGTAGGCACCATCAAT/AACTGTAGGCACCATCAAT -AGATCGGAAGAGCACACGTCT ->Illumina_Single_End_Adapter_1 -GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG ->Illumina_Single_End_Adapter_2 -CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT ->Illumina_Paired_End_Adapter_2 -GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG ->Illumina_Paired_End_PCR_Primer_2 -CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT ->Illumina_Paired_End_Sequencing_Primer_2 -CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT ->Illumina_DpnII_expression_Adapter_1 -ACAGGTTCAGAGTTCTACAGTCCGAC ->Illumina_DpnII_expression_PCR_Primer_1 -CAAGCAGAAGACGGCATACGA ->Illumina_DpnII_expression_PCR_Primer_2 -AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA ->Illumina_NlaIII_expression_Adapter_1 -ACAGGTTCAGAGTTCTACAGTCCGACATG ->Illumina_NlaIII_expression_Sequencing_Primer -CCGACAGGTTCAGAGTTCTACAGTCCGACATG ->Illumina_Multiplexing_Adapter_1 -GATCGGAAGAGCACACGTCT ->Illumina_Multiplexing_Index_Sequencing_Primer -GATCGGAAGAGCACACGTCTGAACTCCAGTCAC ->Illumina_PCR_Primer_Index -CAAGCAGAAGACGGCATACGAGATNNNNNNGTGACTGGAGTTC ->Illumina_DpnII_Gex_Adapter_1 -GATCGTCGGACTGTAGAACTCTGAAC ->Illumina_DpnII_Gex_Adapter_2.01 -TCGTATGCCGTCTTCTGCTTG ->Illumina_DpnII_Gex_Sequencing_Primer -CGACAGGTTCAGAGTTCTACAGTCCGACGATC ->Illumina_NlaIII_Gex_Adapter_1.01 -TCGGACTGTAGAACTCTGAAC ->Illumina_Small_RNA_3p_Adapter_1 -ATCTCGTATGCCGTCTTCTGCTTG ->TruSeq_Universal_Adapter -AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT ->TruSeq_Adapter,_Indices -GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNTCTCGTATGCCGTCTTCTGCTTG ->Illumina_RNA_RT_Primer -GCCTTGGCACCCGAGAATTCCA ->Illumina_RNA_PCR_Primer -AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA ->RNA_PCR_Primer_Indices -CAAGCAGAAGACGGCATACGAGATNNNNNNGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA ->Illumina_Universal_Adapter -AGATCGGAAGAG ->Illumina_Small_RNA_3'_Adapter -TGGAATTCTCGG ->Illumina_Small_RNA_5'_Adapter -GATCGTCGGACT ->Nextera_Transposase_Sequence -CTGTCTCTTATA ->NEB_miRNA_5prime -GTTCAGAGTTCTACAGTCCGACGATC ->Qiagen_miRNA_5prime -GTTCAGAGTTCTACAGTCCGACGATC ->Qiagen_miRNA_3prime -AACTGTAGGCACCATCAAT ->PolyA 
-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ->PolyC -CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ->PolyG -GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG ->PolyT -TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT ->Barcode_Index25_F -ACTGAT ->Barcode_Index25_R -ATCAGT \ No newline at end of file diff --git a/resources/bsjs_020921.fasta.gz b/resources/bsjs_020921.fasta.gz deleted file mode 100644 index 4c51520..0000000 Binary files a/resources/bsjs_020921.fasta.gz and /dev/null differ diff --git a/resources/primers.bed b/resources/primers.bed deleted file mode 100644 index 53895fe..0000000 --- a/resources/primers.bed +++ /dev/null @@ -1,16 +0,0 @@ -NC_009333.1 29751 29770 circ29_F . + -NC_009333.1 29641 29663 circ29_R . - -NC_009333.1 57271 57290 circ57_F . + -NC_009333.1 57057 57076 circ57_R . - -NC_009333.1 88288 88307 circvIRF4_F . - -NC_009333.1 88746 88764 circvIRF4_R . + -NC_009333.1 98176 98195 circ97_F . - -NC_009333.1 97964 97983 circ97_R . + -NC_009333.1 118221 118240 circT0.7_F3 . + -NC_009333.1 117701 117722 circT0.7_R2 . - -NC_009333.1 123582 123601 circORF72_F3 . + -NC_009333.1 123138 123157 circORF72_R1 . - -chr4 37631449 37631468 circ1400_F . - -chr4 37638469 37638488 circ1400_R . + -chr7 129015044 129015063 circ1741_F . - -chr7 129018113 129018132 circ1741_R . + diff --git a/resources/primers.tsv b/resources/primers.tsv deleted file mode 100644 index 4c33ca6..0000000 --- a/resources/primers.tsv +++ /dev/null @@ -1,17 +0,0 @@ -Primer Chromosome Start End Strand PrimerSequence_5prime_to_3prime Reference -circ29_F NC_009333.1 29751 29770 + ACCAGACGGCAAGGTTTTTA https://doi.org/10.1073/pnas.1816183115 -circ29_R NC_009333.1 29641 29663 - TCGTTAGTCAACCTAGCAAAACA https://doi.org/10.1073/pnas.1816183115 -circ57_F NC_009333.1 57271 57290 + CAACCAAAAGGCAGAGTCGT https://doi.org/10.1073/pnas.1816183115 -circ57_R NC_009333.1 57057 57076 - GGCTGAACCCAAGAACTTCA https://doi.org/10.1073/pnas.1816183115 -circvIRF4_F NC_009333.1 88288 88307 - CTCCGTGTGGATACCAGTGA 10.1128/JVI.01952-18 -circvIRF4_R NC_009333.1 88746 88764 + TGGTTCCACGCAACAGTCT 10.1128/JVI.01952-18 -circ97_F NC_009333.1 98176 98195 - GGAAGAAGCTCATGGACTGG https://doi.org/10.1073/pnas.1816183115 -circ97_R NC_009333.1 97964 97983 + GACCTAAAAACCCGGAGGAG https://doi.org/10.1073/pnas.1816183115 -circT0.7_F3 NC_009333.1 118221 118240 + AGTGAGGAGGGAGGAGGGCA -circT0.7_R2 NC_009333.1 117701 117722 - CGCTCTCCCAAACCACACGAAT -circORF72_F3 NC_009333.1 123582 123601 + AAGATTAAGGGCCAACGCGA -circORF72_R1 NC_009333.1 123138 123157 - TAGTTCCTCAGCTGGCAAGC -circ1400_F chr4 37631449 37631468 - ATGTCTGTTAGTGGGGCTGA https://doi.org/10.1073/pnas.1816183115 -circ1400_R chr4 37638469 37638488 + TATCTGCTACCATCGCCTTT https://doi.org/10.1073/pnas.1816183115 -circ1741_F chr7 129015044 129015063 - TCGTTCCTTACGAATTGGAG https://doi.org/10.1073/pnas.1816183115 -circ1741_R chr7 129018113 129018132 + CTGCCGGATCTGTAACAACT https://doi.org/10.1073/pnas.1816183115 diff --git a/run_circRNA_AmpliconSeq b/run_circRNA_AmpliconSeq deleted file mode 100755 index 6274795..0000000 --- a/run_circRNA_AmpliconSeq +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env bash -# Author: Vishal Koparde, Ph.D. 
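The deleted resources/primers.bed is essentially a BED6 projection of resources/primers.tsv just above: Chromosome, Start, End, Primer name, a "." score placeholder, and Strand. A sketch of that per-primer mapping for reference (the pipeline's own workflow/scripts/build_primer_bed.py, later in this diff, goes a step further and collapses each _F/_R primer pair into a single interval per circRNA):

```
import csv
import sys

def primers_tsv_to_bed6(tsv_path):
    """Yield BED6 lines (chrom, start, end, name, score, strand)
    from the Primer/Chromosome/Start/End/Strand columns of primers.tsv."""
    with open(tsv_path) as fh:
        for row in csv.DictReader(fh, delimiter="\t"):
            yield "\t".join(
                [row["Chromosome"], row["Start"], row["End"],
                 row["Primer"], ".", row["Strand"]]
            )

if __name__ == "__main__":
    for bedline in primers_tsv_to_bed6(sys.argv[1]):
        print(bedline)
```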
-# CCBR, NCI -# (c) 2021 -# -# wrapper script to run the circRNA AmpliconSeq pipeline -# -# -# https://github.com/CCBR/CCBR_circRNA_AmpliconSeq - -set -eo pipefail -module purge - -SCRIPTNAME="$0" -SCRIPTDIRNAME=$(readlink -f $(dirname $0)) -SCRIPTBASENAME=$(readlink -f $(basename $0)) - -EXTRA_SINGULARITY_BINDS="-B /data/CCBR_Pipeliner/:/data/CCBR_Pipeliner/ -B /data/Ziegelbauer_lab/resources/:/data/Ziegelbauer_lab/resources/" - -function get_git_commitid_tag() { - cd $1 - gid=$(git rev-parse HEAD) - tag=$(git describe --tags $gid 2>/dev/null) - echo -ne "$gid\t$tag" -} - -# ## setting PIPELINE_HOME -PIPELINE_HOME=$(readlink -f $(dirname "$0")) -echo "Pipeline Dir: $PIPELINE_HOME" -# set snakefile -SNAKEFILE="${PIPELINE_HOME}/workflow/Snakefile" - -# get github commit tag -GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME) -echo "Git Commit/Tag: $GIT_COMMIT_TAG" - -function usage() { cat << EOF - -${SCRIPTBASENAME} ---> run circRNA AmpliconSeq Pipeline - -USAGE: - bash ${SCRIPTNAME} -m/--runmode= -w/--workdir= -Required Arguments: -1. RUNMODE: [Type: String] Valid options: - *) init : initialize workdir - *) run : run with slurm - *) reset : DELETE workdir dir and re-init it - *) dryrun : dry run snakemake to generate DAG - *) unlock : unlock workdir if locked by snakemake - *) runlocal : run without submitting to sbatch -2. WORKDIR: [Type: String]: Absolute or relative path to the output folder with write permissions. -EOF -} - -function err() { cat <<< " -# -# -# - $@ -# -# -# -" && usage && exit 1 1>&2; } - - -function init() { - -# create output folder -if [ -d $WORKDIR ];then err "Folder $WORKDIR already exists!"; fi -mkdir -p $WORKDIR - -# copy config and samples files -sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" -e "s/WORKDIR/${WORKDIR//\//\\/}/g" ${PIPELINE_HOME}/config/config.yaml > $WORKDIR/config.yaml -cp ${PIPELINE_HOME}/config/samples.tsv $WORKDIR/ - -#create log and stats folders -if [ ! -d $WORKDIR/logs ]; then mkdir -p $WORKDIR/logs;echo "Logs Dir: $WORKDIR/logs";fi -if [ ! -d $WORKDIR/stats ];then mkdir -p $WORKDIR/stats;echo "Stats Dir: $WORKDIR/stats";fi - -echo "Done Initializing $WORKDIR. You can now edit $WORKDIR/config.yaml and $WORKDIR/samples.tsv" - -} - -function check_essential_files() { - if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!"; fi - for f in config.yaml samples.tsv; do - if [ ! -f $WORKDIR/$f ]; then err "Error: '${f}' file not found in workdir ... initialize first!";fi - done -} - -function reconfig(){ - # rebuild config file and replace the config.yaml in the WORKDIR - # this is only for dev purposes when new key-value pairs are being added to the config file - check_essential_files - sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" -e "s/WORKDIR/${WORKDIR//\//\\/}/g" ${PIPELINE_HOME}/config/config.yaml > $WORKDIR/config.yaml - echo "$WORKDIR/config.yaml has been updated!" -} - -function runcheck(){ - check_essential_files - module load python/3.7 - module load snakemake/5.24.1 - SINGULARITY_BINDS="$EXTRA_SINGULARITY_BINDS -B ${PIPELINE_HOME}:${PIPELINE_HOME} -B ${WORKDIR}:${WORKDIR}" -} - -function dryrun() { - runcheck - run "--dry-run" -} - -function unlock() { - runcheck - run "--unlock" -} - -function runlocal() { - runcheck - if [ "$SLURM_JOB_ID" == "" ];then err "runlocal can only be done on an interactive node"; fi - module load singularity - run "local" -} - -function runslurm() { - runcheck - run "slurm" -} - -function preruncleanup() { - echo "Running..." 
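The init() function above does all of the pipeline's templating with a single sed call: the shipped config/config.yaml contains literal PIPELINE_HOME and WORKDIR tokens that get rewritten to absolute paths at initialization, and reconfig() reuses the same substitution. A minimal Python equivalent, for illustration only:

```
def render_config(template_path, out_path, pipeline_home, workdir):
    """Mirror: sed -e "s/PIPELINE_HOME/.../g" -e "s/WORKDIR/.../g"
    applied to the shipped config.yaml template."""
    with open(template_path) as fh:
        text = fh.read()
    text = text.replace("PIPELINE_HOME", pipeline_home)
    text = text.replace("WORKDIR", workdir)
    with open(out_path, "w") as fh:
        fh.write(text)

# render_config("config/config.yaml", workdir + "/config.yaml",
#               pipeline_home, workdir)
```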
- - # check initialization - check_essential_files - - cd $WORKDIR - ## Archive previous run files - if [ -f ${WORKDIR}/snakemake.log ];then - modtime=$(stat ${WORKDIR}/snakemake.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g") - mv ${WORKDIR}/snakemake.log ${WORKDIR}/stats/snakemake.${modtime}.log - if [ -f ${WORKDIR}/snakemake.log.HPC_summary.txt ];then - mv ${WORKDIR}/snakemake.log.HPC_summary.txt ${WORKDIR}/stats/snakemake.${modtime}.log.HPC_summary.txt - fi - if [ -f ${WORKDIR}/snakemake.stats ];then - mv ${WORKDIR}/snakemake.stats ${WORKDIR}/stats/snakemake.${modtime}.stats - fi - fi - nslurmouts=$(find ${WORKDIR} -maxdepth 1 -name "slurm-*.out" |wc -l) - if [ "$nslurmouts" != "0" ];then - for f in $(ls ${WORKDIR}/slurm-*.out);do mv ${f} ${WORKDIR}/logs/;done - fi - -} - - -function run() { - - - if [ "$1" == "local" ];then - - preruncleanup - - snakemake -s $SNAKEFILE \ - --directory $WORKDIR \ - --printshellcmds \ - --use-singularity \ - --singularity-args "$SINGULARITY_BINDS" \ - --use-envmodules \ - --latency-wait 120 \ - --configfile ${WORKDIR}/config.yaml \ - --cores all \ - --stats ${WORKDIR}/snakemake.stats \ - 2>&1|tee ${WORKDIR}/snakemake.log - - if [ "$?" -eq "0" ];then - snakemake -s $SNAKEFILE \ - --report ${WORKDIR}/runlocal_snakemake_report.html \ - --directory $WORKDIR \ - --configfile ${WORKDIR}/config.yaml - fi - - elif [ "$1" == "slurm" ];then - - preruncleanup - - cat > ${WORKDIR}/submit_script.sbatch << EOF -#!/bin/bash -#SBATCH --job-name="circRNA_ampliconseq" -#SBATCH --mem=10g -#SBATCH --partition="ccr,norm" -#SBATCH --time=96:00:00 -#SBATCH --cpus-per-task=2 - -module load python/3.7 -module load snakemake/5.24.1 -module load singularity - -cd \$SLURM_SUBMIT_DIR - -snakemake -s $SNAKEFILE \ ---directory $WORKDIR \ ---use-singularity \ ---singularity-args "$SINGULARITY_BINDS" \ ---use-envmodules \ ---printshellcmds \ ---latency-wait 120 \ ---configfile ${WORKDIR}/config.yaml \ ---cluster-config ${PIPELINE_HOME}/config/cluster.json \ ---cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error} --qos={cluster.qos}" \ --j 500 \ ---rerun-incomplete \ ---keep-going \ ---stats ${WORKDIR}/snakemake.stats \ -2>&1|tee ${WORKDIR}/snakemake.log - -if [ "\$?" -eq "0" ];then - snakemake -s $SNAKEFILE \ - --directory $WORKDIR \ - --report ${WORKDIR}/runslurm_snakemake_report.html \ - --configfile ${WORKDIR}/config.yaml -fi - -bash <(curl https://raw.githubusercontent.com/CCBR/Tools/master/Biowulf/gather_cluster_stats.sh 2>/dev/null) ${WORKDIR}/snakemake.log > ${WORKDIR}/snakemake.log.HPC_summary.txt - -EOF - - sbatch ${WORKDIR}/submit_script.sbatch - - else # for unlock and dryrun - -snakemake $1 -s $SNAKEFILE \ ---directory $WORKDIR \ ---use-envmodules \ ---printshellcmds \ ---latency-wait 120 \ ---configfile ${WORKDIR}/config.yaml \ ---cluster-config ${PIPELINE_HOME}/config/cluster.json \ ---cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" \ --j 500 \ ---rerun-incomplete \ ---keep-going \ ---stats ${WORKDIR}/snakemake.stats - - fi - -} - -function reset() { - #delete the workdir and re-initialize it - echo "Working Dir: $WORKDIR" - if [ ! 
-d $WORKDIR ];then err "Folder $WORKDIR does not exist!";fi - echo "Deleting $WORKDIR" - rm -rf $WORKDIR - echo "Re-Initializing $WORKDIR" - init -} - - -function main(){ - - if [ $# -eq 0 ]; then usage; exit 1; fi - - for i in "$@" - do - case $i in - -m=*|--runmode=*) - RUNMODE="${i#*=}" - ;; - -w=*|--workdir=*) - WORKDIR="${i#*=}" - ;; - *) - err "Unknown argument!" # unknown option - ;; - esac - done - WORKDIR=$(readlink -f "$WORKDIR") - echo "Working Dir: $WORKDIR" - # echo SCRIPTNAME = ${SCRIPTNAME} - # echo RUNMODE = ${RUNMODE} - # echo WORKDIR = ${WORKDIR} - # exit; - - - case $RUNMODE in - init) init && exit 0;; - dryrun) dryrun && exit 0;; - unlock) unlock && exit 0;; - run) runslurm && exit 0;; - runlocal) runlocal && exit 0;; - reset) reset && exit 0;; - dry) dryrun && exit 0;; # hidden option - local) runlocal && exit 0;; # hidden option - reconfig) reconfig && exit 0;; # hidden option for debugging - *) err "Unknown RUNMODE \"$RUNMODE\"";; - esac -} - -main "$@" - - - - - diff --git a/workflow/Snakefile b/workflow/Snakefile deleted file mode 100644 index f99ab8b..0000000 --- a/workflow/Snakefile +++ /dev/null @@ -1,26 +0,0 @@ -from snakemake.utils import min_version -min_version("5.24.1") - -from os.path import join - -include: join("rules/init.smk") -include: join("rules/trim.smk") -include: join("rules/align.smk") -include: join("rules/quant.smk") - - -localrules: all - -rule all: - input: - #trim - expand(join(WORKDIR,"results","{sample}","trim","{sample}.R1.trim.fastq.gz"),sample=SAMPLES), - expand(join(WORKDIR,"results","{sample}","trim","{sample}.R2.trim.fastq.gz"),sample=SAMPLES), - #align - expand(join(WORKDIR,"results","{sample}","bam","{sample}.bam"),sample=SAMPLES), - #quant - join(WORKDIR,"results","mergedquant.tsv"), - join(WORKDIR,"results","mergedquant.filtered.tsv") - - - diff --git a/workflow/envs/dummy_env.yaml b/workflow/envs/dummy_env.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk deleted file mode 100644 index 70e09e1..0000000 --- a/workflow/rules/align.smk +++ /dev/null @@ -1,90 +0,0 @@ -# if config['bsjfa']: -# BSJFA=config['bsjfa'] -# else: -# BSJFA=config['bsjfadefault'] -# check_readaccess(BSJFA) -PRIMERINFO=config['primertsv'] -check_readaccess(PRIMERINFO) -REFFA=config['reffa'] -check_readaccess(REFFA) - -rule create_primerbed: - input: - primertsv=PRIMERINFO - output: - primerbed=join(WORKDIR,"bowtie_index","primers.bed") - params: - script=join(SCRIPTSDIR,"build_primer_bed.py") - envmodules: TOOLS["python37"]["version"] - shell:""" -python {params.script} {input.primertsv} > {output.primerbed} -""" - -rule create_bsjfa: - input: - bed=rules.create_primerbed.output.primerbed, - reffa=REFFA - output: - bsjfa=join(WORKDIR,"bowtie_index","bsj.fa") - params: - script=join(SCRIPTSDIR,"generate_bsj_fasta_from_primer_bed.py") - container: "docker://nciccbr/ccbr_htseq_0.13.5:latest" - shell:""" -python {params.script} \ ---bed {input.bed} \ ---reffa {input.reffa} \ ---outfa {output.bsjfa} -""" - -rule build_index: - input: - bsjfa=rules.create_bsjfa.output.bsjfa, - output: - join(WORKDIR,"bowtie_index","bsj.1.bt2") - params: - outdir=join(WORKDIR,"bowtie_index") - envmodules: TOOLS["bowtie2"]["version"] - threads: 2 - shell: """ -# cp {input.bsjfa} {params.outdir} -cd {params.outdir} -bsjfa=$(basename {input.bsjfa}) -bowtie2-build $bsjfa bsj -# gzip -n -f {input.bsjfa} -""" - -rule align: - input: - r1=rules.cutadapt.output.of1, - r2=rules.cutadapt.output.of2, - 
bt2_index=rules.build_index.output, - output: - bam=join(WORKDIR,"results","{sample}","bam","{sample}.bam") - params: - sample="{sample}", - workdir=WORKDIR, - outdir=join(WORKDIR,"results","{sample}","bowtie2"), - peorse=get_peorse, - bowtie2_alignment_parameters=config["bowtie2_alignment_parameters"], - mem=MEMORY - threads: 56 - envmodules: TOOLS["bowtie2"]["version"], TOOLS["sambamba"]["version"] - shell:""" -bt2_index=$(echo "{input.bt2_index}"|awk -F".1.bt2" '{{print $1}}') -if [ "{params.peorse}" == "PE" ];then - bowtie2 \ - --threads {threads} \ - {params.bowtie2_alignment_parameters} \ - -x $bt2_index \ - -U {input.r1},{input.r2} -else - bowtie2 \ - --threads {threads} \ - {params.bowtie2_alignment_parameters} \ - -x $bt2_index \ - -U {input.r1} -fi \ -| awk -F"\\t" '{{if ($6 ~ /60M/ || $1 ~ /^@/){{print}}}}' \ -| sambamba view --nthreads={threads} -S --format=bam -o=/dev/stdout /dev/stdin \ -| sambamba sort --memory-limit={params.mem}G --tmpdir=/dev/shm --nthreads={threads} --out={output.bam} /dev/stdin -""" diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk deleted file mode 100644 index fb87a25..0000000 --- a/workflow/rules/init.smk +++ /dev/null @@ -1,96 +0,0 @@ -import shutil -import sys -import os -import pandas as pd -import yaml -import glob - -CONFIGFILE = str(workflow.overwrite_configfiles[0]) - -def check_existence(filename): - """Checks if file exists on filesystem - :param filename : Name of file to check - """ - filename=filename.strip() - if not os.path.exists(filename): - sys.exit("File: {} does not exists!".format(filename)) - return True - - -def check_readaccess(filename): - """Checks permissions to see if user can read a file - :param filename : Name of file to check - """ - filename=filename.strip() - check_existence(filename) - if not os.access(filename,os.R_OK): - sys.exit("File: {} exists, but user cannot read from file due to permissions!".format(filename)) - return True - - -def check_writeaccess(filename): - """Checks permissions to see if user can write to a file - :param filename : Name of file to check - """ - filename=filename.strip() - check_existence(filename) - if not os.access(filename,os.W_OK): - sys.exit("File: {} exists, but user cannot write to file due to permissions!".format(filename)) - return True - - -# -MEMORY="100" -# get working dir from config -WORKDIR = config["workdir"] - -# get resources folder -try: - RESOURCESDIR = config["resourcesdir"] -except KeyError: - RESOURCESDIR = join(WORKDIR,"resources") - -# get scripts folder -try: - SCRIPTSDIR = config["scriptsdir"] -except KeyError: - SCRIPTSDIR = join(WORKDIR,"scripts") - -## Load tools from YAML file -with open(config["tools"]) as f: - TOOLS = yaml.safe_load(f) - -if not os.path.exists(join(WORKDIR,"fastqs")): - os.mkdir(join(WORKDIR,"fastqs")) -if not os.path.exists(join(WORKDIR,"results")): - os.mkdir(join(WORKDIR,"results")) -for f in ["samples", "tools", "cluster"]: - check_readaccess(config[f]) - -SAMPLESDF = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName") -SAMPLES = list(SAMPLESDF.index) -SAMPLESDF["R1"]=join(RESOURCESDIR,"dummy") -SAMPLESDF["R2"]=join(RESOURCESDIR,"dummy") -SAMPLESDF["PEorSE"]="PE" - -for sample in SAMPLES: - R1file=SAMPLESDF["path_to_R1_fastq"][sample] - R2file=SAMPLESDF["path_to_R2_fastq"][sample] - # print(sample,R1file,R2file) - check_readaccess(R1file) - R1filenewname=join(WORKDIR,"fastqs",sample+".R1.fastq.gz") - if not os.path.exists(R1filenewname): - os.symlink(R1file,R1filenewname) - 
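The awk stage in the align rule above ($6 ~ /60M/ || $1 ~ /^@/) keeps SAM header lines plus alignments whose CIGAR string contains 60M; with the --flankmax default of 30 in generate_bsj_fasta_from_primer_bed.py (later in this diff), the BSJ reference carries 30 nt on each side of the junction, so a junction-spanning read aligns as a clean 60M match. Note this is a substring match, so a CIGAR such as 160M would also pass. A standalone Python sketch of the same filter:

```
import sys

def keep_60m(sam_in=sys.stdin, sam_out=sys.stdout):
    """Keep SAM header lines and records whose CIGAR (field 6) contains 60M."""
    for line in sam_in:
        if line.startswith("@"):
            sam_out.write(line)
        else:
            fields = line.split("\t")
            if len(fields) > 5 and "60M" in fields[5]:
                sam_out.write(line)

if __name__ == "__main__":
    keep_60m()
```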
SAMPLESDF.loc[[sample],"R1"]=R1filenewname - if str(R2file)!='nan': - check_readaccess(R2file) - R2filenewname=join(WORKDIR,"fastqs",sample+".R2.fastq.gz") - if not os.path.exists(R2filenewname): - os.symlink(R2file,R2filenewname) - SAMPLESDF.loc[[sample],"R2"]=R2filenewname - else: - SAMPLESDF.loc[[sample],"PEorSE"]="SE" -# print(SAMPLESDF) -# sys.exit() - - diff --git a/workflow/rules/quant.smk b/workflow/rules/quant.smk deleted file mode 100644 index 0e85878..0000000 --- a/workflow/rules/quant.smk +++ /dev/null @@ -1,53 +0,0 @@ -BSJFA=rules.create_bsjfa.output.bsjfa - -rule salmon: - input: - bam=rules.align.output.bam, - reffa=BSJFA, - output: - sf=join(WORKDIR,"results","{sample}","salmon","quant.sf") - params: - salmon_parameters=config["salmon_parameters"] - envmodules: TOOLS["salmon"]["version"] - shell:""" -outdir=$(dirname {output.sf}) -salmon quant \ ---threads {threads} \ -{params.salmon_parameters} \ --t {input.reffa} \ --a {input.bam} \ ---output $outdir -""" - -rule aggregate_quant: - input: - expand(join(WORKDIR,"results","{sample}","salmon","quant.sf"),sample=SAMPLES) - output: - mergedquant=join(WORKDIR,"results","mergedquant.tsv"), - filteredmergedquant=join(WORKDIR,"results","mergedquant.filtered.tsv") - params: - rowsumfilter=config['aggregate_quant_rowsum_filter'], - plotsdir=join(WORKDIR,"results","pieplots"), - rscript=join(SCRIPTSDIR,"pie_plots.R") - envmodules: TOOLS["salmon"]["version"],TOOLS["R"]["version"] - shell:""" -names="" -quants="" -for i in {input};do - outdir=$(dirname $i) - quants="$quants $outdir" - samplename=$(echo $outdir|awk -F"/" '{{print $(NF-1)}}') - names="$names $samplename" -done -salmon quantmerge --quants $quants --names $names --column numreads -o {output.mergedquant} -head -n1 {output.mergedquant} > {output.filteredmergedquant} -tail -n +2 {output.mergedquant} |\ -awk -F"\\t" -v r={params.rowsumfilter} -v OFS="\\t" '{{for(i=2;i<=NF;i++){{sum[NR]=sum[NR]+$i}};if (sum[NR] >= r) {{print sum[NR],$0}}}}' |\ -sort -k1,1gr |\ -cut -f2- >> {output.filteredmergedquant} -if [ -d {params.plotsdir} ];then rm -rf {params.plotsdir};fi && mkdir {params.plotsdir} && cd {params.plotsdir} && cp {output.filteredmergedquant} . && \ -Rscript {params.rscript} $(basename {output.filteredmergedquant}) && rm -f $(basename {output.filteredmergedquant}) -""" - - - \ No newline at end of file diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk deleted file mode 100644 index 9fe2ff5..0000000 --- a/workflow/rules/trim.smk +++ /dev/null @@ -1,51 +0,0 @@ -def get_fastqs(wildcards): - d=dict() - d["R1"]=SAMPLESDF["R1"][wildcards.sample] - d["R2"]=SAMPLESDF["R2"][wildcards.sample] - return d - -def get_peorse(wildcards): - return SAMPLESDF["PEorSE"][wildcards.sample] - -rule cutadapt: - input: - unpack(get_fastqs) - output: - of1=join(WORKDIR,"results","{sample}","trim","{sample}.R1.trim.fastq.gz"), - of2=join(WORKDIR,"results","{sample}","trim","{sample}.R2.trim.fastq.gz") - params: - sample="{sample}", - workdir=WORKDIR, - outdir=join(WORKDIR,"results","{sample}"), - peorse=get_peorse, - adapters=join(RESOURCESDIR,"TruSeq_and_nextera_adapters.consolidated.fa") - envmodules: TOOLS["cutadapt"]["version"] - threads: 56 - shell:""" -if [ ! 
-d {params.outdir} ];then mkdir {params.outdir};fi -if [ "{params.peorse}" == "PE" ];then - ## Paired-end - cutadapt --pair-filter=any \ - --nextseq-trim=2 \ - --trim-n \ - -n 5 -O 5 \ - -q 10,10 -m 35:35 \ - -b file:{params.adapters} \ - -B file:{params.adapters} \ - -j {threads} \ - -o {output.of1} -p {output.of2} \ - {input.R1} {input.R2} -else - ## Single-end - cutadapt \ - --nextseq-trim=2 \ - --trim-n \ - -n 5 -O 5 \ - -q 10,10 -m 35 \ - -b file:{params.adapters} \ - -j {threads} \ - -o {output.of1} \ - {input.R1} - touch {output.of2} -fi -""" \ No newline at end of file diff --git a/workflow/scripts/build_primer_bed.py b/workflow/scripts/build_primer_bed.py deleted file mode 100644 index bebc7f2..0000000 --- a/workflow/scripts/build_primer_bed.py +++ /dev/null @@ -1,35 +0,0 @@ -import sys -primerstsv=open(sys.argv[1]).readlines() -primerstsv.pop(0) -primers=dict() -for i in primerstsv: - i=i.strip().split("\t") - circRNAnamesplit=i[0].split("_") - circRNAname="_".join(circRNAnamesplit[:-1]) - if not circRNAname in primers: - primers[circRNAname]=dict() - primers[circRNAname]["coordinates"]=list() - ForR=circRNAnamesplit[-1][0] - primers[circRNAname]["chrom"]=i[1] - strand=i[4] - primers[circRNAname]["coordinates"].append(int(i[2])) - primers[circRNAname]["coordinates"].append(int(i[3])) - primers[circRNAname]["coordinates"].sort() - if ForR == "F": - primers[circRNAname]["strand"]=strand - primers[circRNAname]["ftype"]=circRNAnamesplit[-1] - if strand == "+": - primers[circRNAname]["ASorSS"]="SS" - elif strand == "-": - primers[circRNAname]["ASorSS"]="AS" - elif ForR == "R": - primers[circRNAname]["rtype"]=circRNAnamesplit[-1] -for primer in primers: - sname=primer+"_"+primers[primer]["ftype"]+primers[primer]["rtype"]+"##"+primers[primer]["ASorSS"] - #sname="_".join([primer,primers[primer]["ASorSS"]) - #print(sname) - chrom=primers[primer]["chrom"] - start=str(primers[primer]["coordinates"][0]) - end=str(primers[primer]["coordinates"][-1]) - strand=primers[primer]["strand"] - print("\t".join([chrom,start,end,sname,".",strand])) diff --git a/workflow/scripts/gather_cluster_stats.sh b/workflow/scripts/gather_cluster_stats.sh deleted file mode 100644 index 49c326e..0000000 --- a/workflow/scripts/gather_cluster_stats.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -## AUTHOR : Vishal N. 
Koparde, Ph.D., CCBR, NCI -## DATE : Feb 2021 -## This scripts gathers cluster related statistics for jobs run on Biowulf using Snakemake by: -## > extracting "external" jobids from snakemake.log files -## > gather cluster stats for each job using "jobdata" and "jobhist" commands -## > sorts output by job submission time -## > output TSV file -## - -function node2runpartition { - node=$1 - partitions_requested=$2 - run_partition=$(for p in `echo $partitions_requested|awk '{print $NF}'|tr "," " "`;do if [ "$(freen -N $node|grep $p|awk '{print $1}'|grep $p|wc -l)" == "1" ]; then echo $p;break 1;fi;done) - if [ "$run_partition" == "" ];then - echo "unknown" - else - echo "$run_partition" - fi -} - - -function get_jobid_stats { -jobid=$1 -declare -A jobdataarray -notaccountablejob=$(jobhist $jid|grep "No accounting"|wc -l) -if [ "$notaccountablejob" == "1" ];then - jobdataarray["submit_time"]="JOBNOTACCOUNTABLE" - jobdataarray["jobid"]="$jobid" -else - jobdata $jobid > ${jobid}.tmp - awk -F"\t" '{if (NF==2) {print}}' ${jobid}.tmp > ${jobid}.data && rm -f ${jobid}.tmp - while read a b;do - jobdataarray["$a"]="$b" - done < ${jobid}.data - rm -f ${jobid}.data - st=${jobdataarray["submit_time"]} - jobdataarray["human_submit_time"]=$(date -d @$st|sed "s/ /_/g") - jobdataarray["alloc_node_partition"]=$(node2runpartition ${jobdataarray["alloc_node"]} ${jobdataarray["partition"]}) - jobdataarray["run_node_partition"]=$(node2runpartition ${jobdataarray["node_list"]} ${jobdataarray["partition"]}) -fi -echo -ne "${jobdataarray["submit_time"]}\t" -echo -ne "${jobdataarray["human_submit_time"]}\t" -echo -ne "${jobdataarray["jobid"]}:${jobdataarray["state"]}:${jobdataarray["job_name"]}\t" -echo -ne "${jobdataarray["alloc_node"]}:${jobdataarray["alloc_node_partition"]}:${jobdataarray["node_list"]}:${jobdataarray["run_node_partition"]}\t" -echo -ne "${jobdataarray["queued"]}:${jobdataarray["elapsed"]}:${jobdataarray["time_limit"]}\t" -echo -ne "${jobdataarray["avg_cpus"]}:${jobdataarray["max_cpu_used"]}:${jobdataarray["cpus_per_task"]}\t" -echo -ne "${jobdataarray["avg_mem"]}:${jobdataarray["max_mem_used"]}:${jobdataarray["total_mem"]}\t" -echo -ne "${jobdataarray["partition"]}:${jobdataarray["qos"]}\t" -echo -ne "${jobdataarray["username"]}:${jobdataarray["groupname"]}:${jobdataarray["account"]}\t" -echo -ne "${jobdataarray["work_dir"]}\t" -echo -ne "${jobdataarray["std_out"]}\t" -echo -ne "${jobdataarray["std_err"]}\n" -} - -if [ "$#" != "1" ];then - echo " bash $0 " - exit 1 -fi - -snakemakelogfile=$1 -grep "with external jobid" $snakemakelogfile | awk '{print $NF}' | sed "s/['.]//g" | sort | uniq > ${snakemakelogfile}.jobids.lst -echo -ne "##SubmitTime\tHumanSubmitTime\tJobID:JobState:JobName\tAllocNode:AllocNodePartition:RunNode:RunNodePartition\tQueueTime:RunTime:TimeLimit\tAvgCPU:MaxCPU:CPULimit\tAvgMEM:MaxMEM:MEMLimit\tPartition:QOS\tUsername:Group:Account\tWorkdir\tStdOut\tStdErr\n" -while read jid;do - get_jobid_stats $jid -done < ${snakemakelogfile}.jobids.lst |sort -k1,1n -rm -f ${snakemakelogfile}.jobids.lst \ No newline at end of file diff --git a/workflow/scripts/generate_bsj_fasta_from_primer_bed.py b/workflow/scripts/generate_bsj_fasta_from_primer_bed.py deleted file mode 100644 index f164634..0000000 --- a/workflow/scripts/generate_bsj_fasta_from_primer_bed.py +++ /dev/null @@ -1,106 +0,0 @@ -import HTSeq -import sys -import argparse -import os - -def read_bed_file(filename): - bedfile=open(filename,'r') - primers=dict() - for f in bedfile.readlines(): - f=f.strip().split("\t") - primer=f[3] 
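The gather_cluster_stats.sh script above hinges on one extraction step: Snakemake logs each cluster submission with the phrase "with external jobid", and the final whitespace-separated token, stripped of quotes and periods, is the SLURM job id that jobdata/jobhist are then queried with. The same step in Python, assuming the stock Snakemake log wording:

```
import re
import sys

def external_jobids(snakemake_log):
    """Collect unique SLURM job ids from lines like:
    ... with external jobid '12345678'."""
    jobids = set()
    with open(snakemake_log) as fh:
        for line in fh:
            if "with external jobid" in line:
                jobids.add(re.sub(r"['.]", "", line.split()[-1]))
    return sorted(jobids)

if __name__ == "__main__":
    print("\n".join(external_jobids(sys.argv[1])))
```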
- primers[primer]=dict() - primers[primer]["chrom"]=f[0] - primers[primer]["start"]=int(f[1]) - primers[primer]["end"]=int(f[2]) - primers[primer]["strand"]=f[5] - return primers - -def complement(c): - if c=="A": - return "T" - elif c=="C": - return "G" - elif c=="G": - return "C" - elif c=="T": - return "A" - elif c=="N": - return "N" - else: - print("Unknown char %s. expecting A,C,G,T or N"%(c)) - exit() - -def revcom(seq): - rc=seq[::-1].upper() - rc="".join(list(map(lambda x:complement(x),rc))) - return rc - -def convertnt(c): - if c=="A": - return "1" - elif c=="C": - return "4" - elif c=="G": - return "3" - elif c=="T": - return "2" - elif c=="N": - return "0" - else: - return "-1" - -parser = argparse.ArgumentParser(description='Create fasta file from divergent primers bed file') -parser.add_argument('--bed', dest='primerbed', type=str, required=True, - help='Divergent primers bed file with min. 4 columns (chr,start,end,name), name is expected to have _AS/_SS suffix') -parser.add_argument('--reffa', dest='reffa', type=str, required=True, - help='reference fasta') -parser.add_argument('--outfa', dest='outfa', type=str, required=True, - help='output fasta') -parser.add_argument('--scanlength', dest='scanlen', type=int, required=False, default=200, - help='scan length...even positive number(default 200)') -parser.add_argument('--flankmax', dest='flankmax', type=int, required=False, default=30, - help='flankmax (default 30)') -args = parser.parse_args() -# sequences = dict( (s.name, s) for s in HTSeq.FastaReader(args.reffa) ) -sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa, raw_iterator=True) ) -# sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa) ) -primers = read_bed_file(args.primerbed) -# print(primers) -# for primer in primers: -# print(primers[primer]["chrom"]) -outfastafile = open( args.outfa, "w" ) -offset = "N"*300 -for primer in primers: - ASorSS=primer.split("##")[-1] - seq=sequences[primers[primer]["chrom"]] - for i in range(primers[primer]["start"]-args.scanlen,primers[primer]["start"],1): - for j in range(primers[primer]["end"],primers[primer]["end"]+args.scanlen,1): - k=int((j-i)/2) - if k >= args.flankmax: - bsjflanka=seq[ j - args.flankmax : j ] - bsjflankb=seq[ i : i + args.flankmax ] - else: - bsjflanka=seq[ j-k : j ] - bsjflankb=seq[ i : i+k ] - if ASorSS == "SS": - five_da=bsjflanka[-2:]+"-"+seq[j:j+2] - three_da=seq[i-2:i]+"-"+bsjflankb[:2] - da=seq[j:j+2]+"-"+seq[i-2:i] - elif ASorSS == "AS": - five_da=revcom(seq[i-2:i])+"-"+revcom(bsjflankb[:2]) - three_da=revcom(bsjflanka[-2:])+"-"+revcom(seq[j:j+2]) - da=revcom(seq[i-2:i])+"-"+revcom(seq[j:j+2]) - else: - print("Primer %s does not have AS or SS suffix"%(primer)) - exit() - bsjflank=bsjflanka+bsjflankb - bsjflank=bsjflank.upper() - bsjflank_nt="".join(list(map(lambda x:convertnt(x),bsjflank))) - sname = "##".join([primer,primers[primer]["chrom"],str(i),str(j),five_da,three_da,da,bsjflank_nt]) - # print(sname) - # print(bsjflank) - myseq = HTSeq.Sequence( bytes(offset+bsjflank+offset, 'utf-8'), sname) - myseq.write_to_fasta_file(outfastafile) -outfastafile.close() - diff --git a/workflow/scripts/get_donor_acceptor_from_salmon_quant.py b/workflow/scripts/get_donor_acceptor_from_salmon_quant.py deleted file mode 100644 index 7640899..0000000 --- a/workflow/scripts/get_donor_acceptor_from_salmon_quant.py +++ /dev/null @@ -1,34 +0,0 @@ -import HTSeq -import sys -import argparse -import os - -def read_bed_file(filename): - bedfile=open(filename,'r') - primers=dict() - for f in 
bedfile.readlines(): - f=f.strip().split("\t") - primer=f[3] - primers[primer]=dict() - primers[primer]["chrom"]=f[0] - primers[primer]["start"]=int(f[1]) - primers[primer]["end"]=int(f[2]) - primers[primer]["strand"]=f[5] - return primers - -parser = argparse.ArgumentParser(description='Create fasta file from divergent primers bed file') -parser.add_argument('--bed', dest='primerbed', type=str, required=True, - help='Divergent primers bed file with min. 4 columns (chr,start,end,name), name is expected to have _AS/_SS suffix') -parser.add_argument('--reffa', dest='reffa', type=str, required=True, - help='reference fasta') -parser.add_argument('--outfa', dest='outfa', type=str, required=True, - help='output fasta') -parser.add_argument('--scanlength', dest='scanlen', type=int, required=False, default=200, - help='scan length...even positive number(default 200)') -parser.add_argument('--flankmax', dest='flankmax', type=int, required=False, default=30, - help='flankmax (default 30)') -args = parser.parse_args() -# sequences = dict( (s.name, s) for s in HTSeq.FastaReader(args.reffa) ) -sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa, raw_iterator=True) ) -# sequences = dict( (s[1], s[0]) for s in HTSeq.FastaReader(args.reffa) ) -primers = read_bed_file(args.primerbed) \ No newline at end of file diff --git a/workflow/scripts/pie_plots.R b/workflow/scripts/pie_plots.R deleted file mode 100644 index 2f32303..0000000 --- a/workflow/scripts/pie_plots.R +++ /dev/null @@ -1,95 +0,0 @@ - -rm(list=ls()) -library("tidyverse") -library("RColorBrewer") -n <- 101 -qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] -col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) - -read_data_table <- function(fname){ - d=read.csv(fname,sep="\t",header=TRUE) - samplelist=colnames(d)[2:length(colnames(d))] - d %>% separate(col="Name",into=c("circRNA","ASorSS","start","end","fiveDA","threeDA","DA","NT"),sep="##") -> d - d$fiveDA=NULL - d$threeDA=NULL - x=list() - x[["table"]]=d - x[["samplelist"]]=samplelist - x[["circRNAlist"]]=unique(d$circRNA) - return(x) -} -filter_table_by_cname <-function(d,circRNA){ - k=d$circRNA==circRNA - d=d[k,] - return(d) -} -filter_table_by_sname <-function(d,samplename){ - d=data.frame(DA=d$DA,counts=d[[samplename]]) - d=d[order(d$counts,decreasing = TRUE),] - d=d[!(d$counts==0),] - return(d) -} -add_perc_sign <- function(s){ - result="" - if (as.character(s)!=""){ - result=paste0(as.character(s),"%") - } - return(result) -} - -create_perc_table<-function(df){ - table_percent <- df %>% mutate(DA=DA,perc = round((counts/ sum(counts)) * 100, 1)) %>% - mutate(DA=DA,labels=perc,y_text=cumsum(perc)-perc/2) - nr=nrow(table_percent) - if (nr>4){ - table_percent[5:nrow(table_percent),][["labels"]]="" - } - table_percent[table_percent$labels!="",]$labels=paste(table_percent[table_percent$labels!="",]$DA,table_percent[table_percent$labels!="",]$labels) - table_percent$labels=lapply(table_percent$labels,add_perc_sign) - table_percent[table_percent$labels!="",]$labels=paste0(table_percent[table_percent$labels!="",]$labels,"(",table_percent[table_percent$labels!="",]$counts,")") - return(table_percent) -} - - -args = commandArgs(trailingOnly=TRUE) -# args[1]="mergedquant.filtered.tsv" # uncomment to hardcode the input for interactive debugging -d=read_data_table(args[1]) -samplelist=d$samplelist -circRNAlist=d$circRNAlist
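pie_plots.R consumes mergedquant.filtered.tsv, which the aggregate_quant rule in quant.smk (earlier in this diff) derives from salmon quantmerge output: keep the header, sum the per-sample numreads columns of each row, drop rows whose sum falls below aggregate_quant_rowsum_filter (50 by default), and sort the survivors by row sum, descending. The awk/sort/cut pipeline restated as a Python sketch:

```
def filter_merged_quant(in_tsv, out_tsv, rowsum_min=50):
    """Keep quantmerge rows whose summed counts (columns 2..N) reach
    rowsum_min, ordered by row sum, descending."""
    with open(in_tsv) as fh:
        header = fh.readline()
        rows = []
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            rowsum = sum(float(x) for x in fields[1:])
            if rowsum >= rowsum_min:
                rows.append((rowsum, line))
    rows.sort(key=lambda r: r[0], reverse=True)
    with open(out_tsv, "w") as fh:
        fh.write(header)
        fh.writelines(line for _, line in rows)

# filter_merged_quant("mergedquant.tsv", "mergedquant.filtered.tsv", 50)
```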
-d=d$table - -for(i in 1:length(circRNAlist)) { - cname=circRNAlist[[i]][1] - for (j in 1:length(samplelist)){ - sname=samplelist[[j]][1] - df=filter_table_by_sname(filter_table_by_cname(d,cname),sname) - if(nrow(df)>0){ - df=create_perc_table(df) - fname=paste0(cname,"-",sname,".png") - png(fname) - pie(df$perc,labels = df$labels,main= paste0(cname,"-",sname)) - dev.off() - } - } -} - diff --git a/workflow/scripts/run_wrapper.bash b/workflow/scripts/run_wrapper.bash deleted file mode 100644 index 155e223..0000000 --- a/workflow/scripts/run_wrapper.bash +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env bash -# Author: Vishal Koparde, Ph.D. -# CCBR, NCI -# (c) 2021 -# -# wrapper script to run the TOBIAS snakemake workflow -# run tobias -# https://github.com/loosolab/tobias/ -# ## clone the pipeline to a folder -# ## git clone https://github.com/loosolab/TOBIAS.git - -set -eo pipefail -module purge - -function get_git_commitid_tag() { - cd $1 - gid=$(git rev-parse HEAD) - tag=$(git describe --tags $gid 2>/dev/null) - echo -ne "$gid\t$tag" -} - -# ## setting PIPELINE_HOME -PIPELINE_HOME=$(readlink -f $(dirname "$0")) -echo "Pipeline Dir: $PIPELINE_HOME" -SNAKEFILE="${PIPELINE_HOME}/Snakefile" -# get github commit tag -GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME) -echo "Git Commit/Tag: $GIT_COMMIT_TAG" - -function usage() { cat << EOF -run_tobias.sh: run TOBIAS for ATAC seq data -USAGE: - bash run_tobias.sh -Required Positional Argument: - MODE: [Type: Str] Valid options: - a) init : initialize workdir - b) run : run with slurm - c) reset : DELETE workdir dir and re-init it - e) dryrun : dry run snakemake to generate DAG - f) unlock : unlock workdir if locked by snakemake - g) runlocal : run without submitting to sbatch -EOF -} - -function err() { cat <<< " -# -# -# - $@ -# -# -# -" && usage && exit 1 1>&2; } - -function init() { - -if [ "$#" -eq "1" ]; then err "init needs an absolute path to the working dir"; fi -if [ "$#" -gt "2" ]; then err "init takes only one more argument"; fi -WORKDIR=$2 -x=$(echo $WORKDIR|awk '{print substr($1,1,1)}') -if [ "$x" != "/" ]; then err "working dir should be supplied as an absolute path"; fi -echo "Working Dir: $WORKDIR" -if [ -d $WORKDIR ];then err "Folder $WORKDIR already exists!"; exit 1; fi -mkdir -p $WORKDIR -sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" -e "s/WORKDIR/${WORKDIR//\//\\/}/g" ${PIPELINE_HOME}/config/config.yaml > $WORKDIR/config.yaml - -#create log and stats folders -if [ ! -d $WORKDIR/logs ]; then mkdir -p $WORKDIR/logs;echo "Logs Dir: $WORKDIR/logs";fi -if [ ! -d $WORKDIR/stats ];then mkdir -p $WORKDIR/stats;echo "Stats Dir: $WORKDIR/stats";fi - -echo "Done Initializing $WORKDIR. You can now edit $WORKDIR/config.yaml and $WORKDIR/samples.tsv" - -} - -function runcheck(){ - if [ "$#" -eq "1" ]; then err "absolute path to the working dir needed"; usage; exit 1; fi - if [ "$#" -gt "2" ]; then err "too many arguments"; usage; exit 1; fi - WORKDIR=$2 - echo "Working Dir: $WORKDIR" - SINGULARITY_BINDS="-B ${PIPELINE_HOME}:${PIPELINE_HOME} -B ${WORKDIR}:${WORKDIR}" - if [ !
-d $WORKDIR ];then err "Folder $WORKDIR does not exist!"; exit 1; fi - module load python/3.7 - module load snakemake/5.24.1 -} - -function dryrun() { - runcheck "$@" - run "--dry-run" -} - -function unlock() { - runcheck "$@" - run "--unlock" -} - -function runlocal() { - runcheck "$@" - if [ "$SLURM_JOB_ID" == "" ];then err "runlocal can only be done on an interactive node"; exit 1; fi - module load singularity - run "local" -} - -function runslurm() { - runcheck "$@" - run "slurm" -} - -function preruncleanup() { - echo "Running..." - - cd $WORKDIR - ## check if initialized - for f in config.yaml samples.tsv; do - if [ ! -f $WORKDIR/$f ]; then err "Error: '${f}' file not found in workdir ... initialize first!";usage && exit 1;fi - done - ## Archive previous run files - if [ -f ${WORKDIR}/snakemake.log ];then - modtime=$(stat ${WORKDIR}/snakemake.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g") - mv ${WORKDIR}/snakemake.log ${WORKDIR}/stats/snakemake.${modtime}.log - if [ -f ${WORKDIR}/snakemake.log.HPC_summary.txt ];then - mv ${WORKDIR}/snakemake.log.HPC_summary.txt ${WORKDIR}/stats/snakemake.${modtime}.log.HPC_summary.txt - fi - if [ -f ${WORKDIR}/snakemake.stats ];then - mv ${WORKDIR}/snakemake.stats ${WORKDIR}/stats/snakemake.${modtime}.stats - fi - fi - nslurmouts=$(find ${WORKDIR} -maxdepth 1 -name "slurm-*.out" |wc -l) - if [ "$nslurmouts" != "0" ];then - for f in $(ls ${WORKDIR}/slurm-*.out);do gzip -n $f;mv ${f}.gz ${WORKDIR}/logs/;done - fi - -} - -function postrun() { - bash ${PIPELINE_HOME}/scripts/gather_cluster_stats.sh ${WORKDIR}/snakemake.log > ${WORKDIR}/snakemake.log.HPC_summary.txt -} - -function run() { - - - if [ "$1" == "local" ];then - - preruncleanup - - snakemake -s ${PIPELINE_HOME}/$SNAKEFILE \ - --directory $WORKDIR \ - --printshellcmds \ - --use-singularity \ - --singularity-args $SINGULARITY_BINDS \ - --use-envmodules \ - --latency-wait 120 \ - --configfile ${WORKDIR}/config.yaml \ - --cores all \ - --stats ${WORKDIR}/snakemake.stats \ - 2>&1|tee ${WORKDIR}/snakemake.log - - if [ "$?" -eq "0" ];then - snakemake -s ${PIPELINE_HOME}/$SNAKEFILE \ - --report ${WORKDIR}/runlocal_snakemake_report.html \ - --directory $WORKDIR \ - --configfile ${WORKDIR}/config.yaml - fi - - postrun - - elif [ "$1" == "slurm" ];then - - preruncleanup - - cat > ${WORKDIR}/submit_script.sbatch << EOF -#!/bin/bash -#SBATCH --job-name="insert_jobname_here" -#SBATCH --mem=10g -#SBATCH --partition="ccr,norm" -#SBATCH --time=96:00:00 -#SBATCH --cpus-per-task=2 - -module load python/3.7 -module load snakemake/5.24.1 -module load singularity - -cd \$SLURM_SUBMIT_DIR - -snakemake -s $SNAKEFILE \ ---directory $WORKDIR \ ---use-singularity \ ---singularity-args $SINGULARITY_BINDS \ ---use-envmodules \ ---printshellcmds \ ---latency-wait 120 \ ---configfile ${WORKDIR}/config.yaml \ ---cluster-config ${PIPELINE_HOME}/config/cluster.json \ ---cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" \ --j 500 \ ---rerun-incomplete \ ---keep-going \ ---stats ${WORKDIR}/snakemake.stats \ -2>&1|tee ${WORKDIR}/snakemake.log - -if [ "\$?" 
-eq "0" ];then - snakemake -s $SNAKEFILE \ - --directory $WORKDIR \ - --report ${WORKDIR}/runslurm_snakemake_report.html \ - --configfile ${WORKDIR}/config.yaml -fi - -bash ${PIPELINE_HOME}/scripts/gather_cluster_stats.sh ${WORKDIR}/snakemake.log > ${WORKDIR}/snakemake.log.HPC_summary.txt - -EOF - - sbatch ${WORKDIR}/submit_script.sbatch - - else # for unlock and dryrun - -snakemake $1 -s ${SNAKEFILE} \ ---directory $WORKDIR \ ---use-envmodules \ ---printshellcmds \ ---latency-wait 120 \ ---configfile ${WORKDIR}/config.yaml \ ---cluster-config ${PIPELINE_HOME}/config/cluster.json \ ---cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" \ --j 500 \ ---rerun-incomplete \ ---keep-going \ ---stats ${WORKDIR}/snakemake.stats - fi - -} - -function reset() { -if [ "$#" -eq "1" ]; then err "reset needs an absolute path to the existing working dir"; usage; fi -if [ "$#" -gt "2" ]; then err "reset takes only one more argument"; usage; fi -WORKDIR=$2 -echo "Working Dir: $WORKDIR" -if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!";fi -echo "Deleting $WORKDIR" -rm -rf $WORKDIR -echo "Re-Initializing $WORKDIR" -init "$@" -} - - -function main(){ - - if [ $# -eq 0 ]; then usage; exit 1; fi - - case $1 in - init) init "$@" && exit 0;; - dryrun) dryrun "$@" && exit 0;; - unlock) unlock "$@" && exit 0;; - run) runslurm "$@" && exit 0;; - runlocal) runlocal "$@" && exit 0;; - reset) reset "$@" && exit 0;; - -h | --help | help) usage && exit 0;; - -* | --*) err "Error: Failed to provide a valid mode."; usage && exit 1;; - *) err "Error: Mode '${1}' is not supported."; usage && exit 1;; - esac -} - -main "$@" - - - - -
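Finally, a note on how config/cluster.json (near the top of this diff) meets the --cluster "sbatch ..." templates in both run scripts: Snakemake resolves each {cluster.key} per job by overlaying the rule's entry (align, cutadapt) on __default__, so align jobs get 120g of memory and 56 threads while everything else inherits the defaults. A sketch of that overlay, with an abbreviated sbatch template:

```
import json

SBATCH = ("sbatch --gres {gres} --cpus-per-task {threads} "
          "-p {partition} -t {time} --mem {mem}")

def sbatch_for_rule(cluster_json, rule):
    """Overlay a rule's cluster settings on __default__ and fill the
    (abbreviated) sbatch template passed via snakemake --cluster."""
    with open(cluster_json) as fh:
        cfg = json.load(fh)
    settings = dict(cfg["__default__"])
    settings.update(cfg.get(rule, {}))
    return SBATCH.format(**settings)

# sbatch_for_rule("config/cluster.json", "align")
# -> 'sbatch --gres lscratch:256 --cpus-per-task 56 -p ccr -t 48:00:00 --mem 120g'
```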